def load_preprocess(self, database_directory):
    """Load the article, interaction and wellness-score tables from SQLite and build the merged content dataframe."""
    # dd.read_sql_table expects a SQLAlchemy URI and an index column rather than a
    # DBAPI connection; 'content_id' is assumed to be a usable index column here.
    uri = f"sqlite:///{database_directory}"
    self.articles_df = dd.read_sql_table('content_data', uri, index_col='content_id').reset_index()
    self.interactions_df = dd.read_sql_table('user_data', uri, index_col='content_id').reset_index()

    df = self.articles_df.merge(self.interactions_df, on='content_id')
    df['eventStrength'] = df['event_type'].apply(
        lambda x: self.event_type_strength[x])
    df = df.drop_duplicates()
    df = df.groupby(['person_id', 'content_id', 'title']).sum().reset_index()

    # categorize() makes the categories known so that .cat.codes works on a Dask dataframe
    df = df.categorize(columns=['title', 'person_id', 'content_id'])
    df['personId'] = df['person_id'].cat.codes
    df['contentId'] = df['content_id'].cat.codes
    self.content_data = df

    # Load user wellness score data ('person_id' as index column is an assumption)
    self.scores_data = dd.read_sql_table('scores_data', uri, index_col='person_id')
def baixa_enderecos_cnpj_Dask(bpergunta=True):
    # rodaSo1bloco = True
    print('STARTING baixa_enderecos_cnpj-------------------------', time.ctime())
    conBaseCompleta = sqlalchemy.create_engine(f"sqlite:///{camDbSqliteBaseCompleta}")
    # , execution_options={"sqlite_raw_colnames": True})
    # conEnderecoNormalizado = sqlalchemy.create_engine(f"sqlite:///{camDBSaida}", execution_options={"sqlite_raw_colnames": True})
    query = '''
        CREATE TABLE endereco_aux AS
        SELECT t.cnpj,
               CAST(t.cnpj_basico AS int) AS cnpj_basico,
               situacao_cadastral AS situacao,
               -- tipo_logradouro,
               logradouro, numero, complemento, bairro,
               (logradouro || ' ' || numero || ' ' || complemento) AS logradouroNumeroComplemento,
               IFNULL(tm.descricao, '') AS municipio,
               t.uf
        FROM estabelecimento t
        LEFT JOIN municipio tm ON tm.codigo = t.municipio
        LIMIT 3000000
    '''  # there may be companies outside the test base
    print(time.ctime(), 'creating table endereco_aux')
    conBaseCompleta.execute('DROP TABLE IF EXISTS endereco_aux')
    conBaseCompleta.execute(query)
    print(time.ctime(), 'creating table endereco_aux. Done.')
    # inicio = 0
    # kregistros = gstep
    # numeroDeRegistros = conBaseCompleta.execute('select count(*) from estabelecimento').fetchall()[0][0]
    conBaseCompleta = None

    print(time.ctime(), 'dask enderecos')
    pend = dd.read_sql_table('endereco_aux', f"sqlite:///{camDbSqliteBaseCompleta}",
                             index_col='cnpj_basico')
    pend['endereco'] = (pend['logradouroNumeroComplemento']
                        .apply(normalizaEndereco, meta=('logradouroNumeroComplemento', 'object'))
                        + '-' + pend['municipio'] + '-' + pend['uf'])
    dftmptable = pend[['cnpj', 'endereco', 'situacao']]
    dftmptable.to_sql('endereco', f"sqlite:///{camDBSaida}",
                      if_exists='append', dtype=sqlalchemy.types.String)
    print('ROUTINE FINISHED ' + time.ctime())
def extract(table_name, schema_name, column_names, index_col, id):
    try:
        # create empty dataframe as model for importing data from sql table to dask dataframe
        # (for meta argument in read_sql_table method)
        empty_df = pd.DataFrame(columns=column_names, dtype='object')
        empty_df[index_col] = pd.to_numeric(empty_df[index_col], errors='coerce')

        # get number of cores to set npartitions:
        ncores = psutil.cpu_count(logical=False)
        logger.warning('ncores used by Dask = %s', ncores)

        # set dask dataframe index
        index_dask = sqlalchemy.sql.column(index_col).label("gn_id")

        # get user table row data as a dask dataframe
        df = dd.read_sql_table(table=table_name,
                               index_col=index_dask,
                               uri=str(DB.engine.url),
                               schema=schema_name,
                               bytes_per_chunk=100000000)
        return df
    except Exception:
        raise
def read_db(path, table, index_col):
    """
    Read a table from a SQL database into a Dask dataframe.

    Args:
        path: SQLAlchemy database URI, e.g. 'sqlite:///data.db'
        table: name of the table to read
        index_col: column to use as the Dask index
    """
    engine = sa.create_engine(path)
    conn = engine.connect()
    m = sa.MetaData()
    table = sa.Table(table, m, autoload=True, autoload_with=engine)
    # conn.execute("create table testtable (uid integer Primary Key, datetime NUM)")
    # conn.execute("insert into testtable values (1, '2017-08-03 01:11:31')")
    # print(conn.execute('PRAGMA table_info(testtable)').fetchall())
    # conn.close()
    uid, dt = list(table.columns)
    q = sa.select([dt.cast(sa.types.String)]).select_from(table)

    daskDF = dd.read_sql_table(table, path, index_col=index_col,
                               parse_dates={'datetime': '%Y-%m-%d %H:%M:%S'})
    return daskDF
def migrate_table_dask(table, table_id, input_schema, output_schema, uri_input, uri_output,
                       npartitions, bytes_per_chunk='256MB', parallel=True,
                       if_exists='append', method='multi'):
    df = dd.read_sql_table(table=table,
                           uri=uri_input,
                           schema=input_schema,
                           index_col=table_id,
                           npartitions=npartitions,
                           bytes_per_chunk=bytes_per_chunk)
    # df = df.drop(f'{table_id}__1', axis=1)
    dd.to_sql(df, uri=uri_output, name=table, schema=output_schema,
              if_exists=if_exists, parallel=parallel, method=method)
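# A hypothetical invocation sketch for the helper above; the connection strings,
# schemas and table names are placeholders, not values from the original project.
# migrate_table_dask(
#     table='events', table_id='event_id',
#     input_schema='public', output_schema='archive',
#     uri_input='postgresql://user:pass@source-host:5432/db',
#     uri_output='postgresql://user:pass@target-host:5432/db',
#     npartitions=8,
# )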
def __init__(self, config: dict = None, config_file_path: str = None):
    """
    Database Class constructor
    :param config: Dictionary based configuration. Required keys are "host", "port", "database", "username", "password"
    :param config_file_path: Also supports file based configuration. The file should be in JSON format with the above keys.
    """
    try:
        if config is None:
            if config_file_path is None:
                if 'EPYODBC_DBCONFIG' not in os.environ:
                    raise Exception(
                        "Server credentials missing! Set EPYODBC_DBCONFIG environment variable or pass config/config_file_path to the constructor"
                    )
                else:
                    config_file_path = os.environ['EPYODBC_DBCONFIG']
            print(f"Loading server config from : {config_file_path}")
            config = json.load(open(config_file_path))
    except Exception as e:
        print(e)
        exit(1)

    assert "host" in config, f"host key missing in config file: {config_file_path}"
    assert "port" in config, f"port key missing in config file: {config_file_path}"
    assert "database" in config, f"database key missing in config file: {config_file_path}"
    assert "username" in config, f"username key missing in config file: {config_file_path}"
    assert "password" in config, f"password key missing in config file: {config_file_path}"

    self.host = config["host"]
    self.port = config["port"]
    self.database = config["database"]
    self.username = config["username"]
    self.password = config["password"]
    self.conn = self.connect()
    self.SKIP_TABLES = [
        'MSreplication_options', 'spt_fallback_db', 'spt_fallback_dev',
        'spt_fallback_usg', 'spt_monitor', 'trace_xe_action_map',
        'trace_xe_event_map'
    ]
    self.tables = []
    self.index_cols = {}
    for row in self.conn.cursor().tables(tableType='TABLE'):
        if row.table_name not in self.SKIP_TABLES:
            self.tables.append(row.table_name)
    for table in self.tables:
        self.index_cols[table] = list(
            self.conn.cursor().primaryKeys(table))[0][3]
    # expose each table as a lazily evaluated Dask dataframe attribute, e.g. self.customers_
    for table in self.tables:
        self.__setattr__(
            f"{table}_",
            dd.read_sql_table(
                table=f"{table}",
                uri=f'mssql+pyodbc://{config["username"]}:{config["password"]}@{config["host"]}:{config["port"]}/{config["database"]}?DRIVER={{ODBC Driver 17 for SQL Server}};',
                index_col=self.index_cols[table]))
def upgrade_rtd(self):
    """
    Pull data from the database that is not yet in the local cache.
    This function seems to work but is not properly tested.
    """
    rtd = self.load_data()
    len_beginning = len(rtd)
    print('Rows before update:', len_beginning)
    max_date = rtd['ar_pt'].max().compute() - datetime.timedelta(days=2)
    max_date = max_date.to_pydatetime()
    print('getting data added since', max_date)

    from sqlalchemy import Column, DateTime
    from sqlalchemy import sql
    from sqlalchemy.dialects import postgresql

    with get_engine().connect() as connection:
        query = sql.select([Column(c) for c in self.df_dict] + [Column('hash_id')])\
            .where((Column('ar_pt', DateTime) > str(max_date)) | (Column('dp_pt', DateTime) > str(max_date)))\
            .select_from(sql.table(Rtd.__tablename__))\
            .alias('new_rtd')
        view_query = 'CREATE OR REPLACE VIEW new_rtd AS {}'\
            .format(str(query.compile(dialect=postgresql.dialect(), compile_kwargs={"literal_binds": True})))
        connection.execute(view_query)
        new_rtd = dd.read_sql_table('new_rtd', DB_CONNECT_STRING,
                                    index_col='hash_id', meta=self.meta, npartitions=20)

    new_rtd.to_parquet(self.DATA_CACHE_PATH + '_new', engine='pyarrow', schema='infer')
    new_rtd = dd.read_parquet(self.DATA_CACHE_PATH + '_new', engine='pyarrow')
    new_rtd = self._parse(new_rtd)
    new_rtd.to_parquet(self.DATA_CACHE_PATH + '_new', engine='pyarrow', schema='infer')
    new_rtd = dd.read_parquet(self.DATA_CACHE_PATH + '_new', engine='pyarrow')

    # Remove changes from rtd that are also present in new_rtd
    rtd = rtd.loc[~rtd.index.isin(new_rtd.index.compute()), :]
    rtd = dd.concat([rtd, new_rtd], axis=0, ignore_index=False)

    # We need to recategorize here, as the categories might grow from int8 to int16
    # and then they need to be recalculated.
    rtd = self._categorize(rtd)
    rtd.to_parquet(self.DATA_CACHE_PATH, engine='pyarrow', schema='infer')

    rtd = self.load_data()
    self._save_encoders(rtd)
    len_end = len(rtd)
    print('Rows after getting new data:', len_end)
    print('Got', len_end - len_beginning, 'new rows')
    print('Number of duplicate indices:', rtd.index.compute().duplicated(keep='last').sum())
def fetch_dataframe(ytd):
    # Read the data
    df = dd.read_sql_table("testdaten", 'sqlite:///Kundendaten.db', "Jahr")
    # Reduce the data to the requested year
    try:
        df_YTD = df.loc[ytd].compute()
    except Exception:
        df_YTD = df.loc[ytd - 1].compute()
        print(f"No data available for year {ytd}. Loading previous year's data...")
    return df_YTD
def read():
    database = 'sqlite:///registered_voters.sqlite'
    registered08 = dd.read_sql_table('ALL', database, index_col='index')
    registered08.columns = registered08.columns.str.lower()
    registered08['year'] = 2008
    registered08 = registered08.rename(columns={
        'voters': 'Total Voters'
    }).drop('party', axis='columns')
    return registered08
def download_rtd(self):
    """
    Pull the Rtd.__tablename__ table from db, parse it and save it on disk.
    """
    with ProgressBar():
        rtd = dd.read_sql_table(self.__tablename__, DB_CONNECT_STRING,
                                index_col='hash_id', meta=self.meta, npartitions=200)
        rtd.to_parquet(self.DATA_CACHE_PATH, engine='pyarrow', schema='infer')  # write_metadata_file=False)

        rtd = dd.read_parquet(self.DATA_CACHE_PATH, engine='pyarrow')
        rtd = self._parse(rtd)
        self._save_encoders(rtd)

        # Save data to parquet. We have to use pyarrow as fastparquet does not support pd.Int64
        rtd.to_parquet(self.DATA_CACHE_PATH, engine='pyarrow', schema='infer')
def send_to_s3bucket(file_name, bucket_path, table, table_id, input_schema, uri_input,
                     npartitions, bytes_per_chunk='256MB'):
    df = dd.read_sql_table(table=table,
                           uri=uri_input,
                           schema=input_schema,
                           index_col=table_id,
                           npartitions=npartitions,
                           bytes_per_chunk=bytes_per_chunk)
    # df = df.drop(f'{table_id}__1', axis=1)
    df.to_csv(bucket_path + file_name)
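# A hedged usage sketch for the helper above; the bucket, table and connection
# values are placeholders. Dask writes one CSV per partition unless
# single_file=True, and a '*' in the file name marks where the partition
# number is substituted.
# send_to_s3bucket(
#     file_name='export-*.csv', bucket_path='s3://my-bucket/exports/',
#     table='orders', table_id='order_id', input_schema='public',
#     uri_input='postgresql://user:pass@host:5432/db', npartitions=8,
# )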
def extract(table_name, schema_name, column_names, index_col, id):
    # create empty dataframe as model for importing data from sql table to dask dataframe
    # (for meta argument in read_sql_table method)
    empty_df = pd.DataFrame(columns=column_names, dtype="object")
    empty_df[index_col] = pd.to_numeric(empty_df[index_col], errors="coerce")

    # set dask dataframe index
    index_dask = sqlalchemy.sql.column(index_col).label("gn_id")

    query = """
    ALTER TABLE {schema_name}.{table_name}
    ALTER {index_col} TYPE integer
    USING {index_col}::integer;
    """
    query_nb_row = """
    SELECT count(*) FROM {schema_name}.{table_name}
    """.format(schema_name=schema_name, table_name=table_name)

    # default partition count, so the variable is defined even if the ALTER/count below fails
    npartition = 1
    try:
        DB.session.execute(
            query.format(schema_name=schema_name,
                         table_name=table_name,
                         index_col=index_col))
        DB.session.commit()
        query = DB.session.execute(query_nb_row).fetchone()
        nb_row = query[0]
        npartition = 1 if nb_row < 50000 else 2
    except Exception as e:
        DB.session.rollback()

    # get user table row data as a dask dataframe
    df = dd.read_sql_table(
        table=table_name,
        index_col=index_dask,
        uri=str(DB.engine.url),
        schema=schema_name,
        # bytes_per_chunk=100000000,
        npartitions=npartition,
    )
    return df
def out_of_core_execute(self, timestamp_column, f, blocksize=268435456, **kwargs):
    query = f(**kwargs)
    timestamps = [[str(date_range[1]), str(date_range[2])]
                  for date_range in self.__get_time_splits(query, 1)][0]
    columns = re.findall(self.col_pattern, query)[0].replace(' ', '').split(',')
    table = re.findall(self.table_pattern, query)[0].split('.')[1]
    df = dd.read_sql_table(table=table,
                           divisions=timestamps,
                           uri=self.uri,
                           columns=columns,
                           index_col=timestamp_column,
                           bytes_per_chunk=blocksize)
    return df
def read_tables(engine, table_names):
    """ Get pandas dataframes for all tables in table_names """
    logging.debug("reading SQL tables %s", table_names)
    all_tables = {}
    if USE_DASK:
        for name in table_names:
            class_ = get_table_name_from_class(name)
            p_keys = [key.name for key in inspect(class_).primary_key][0]
            # read_sql_table expects the URI as a string, not a SQLAlchemy URL object
            table = dd.read_sql_table(table=name, uri=str(engine.url), index_col=p_keys)
            all_tables[name] = table
            logging.debug("df %s has columns %s", name, table.columns)
    else:
        for name in table_names:
            table = pd.read_sql_table(name, engine)
            all_tables[name] = table
            logging.debug("df %s has columns %s", name, table.columns)
    return all_tables
def dataframe_loader(_context, config):
    file_type, file_options = list(config.items())[0]
    path = file_options.get("path")

    if file_type == "csv":
        return dd.read_csv(path, **dict_without_keys(file_options, "path"))
    elif file_type == "parquet":
        return dd.read_parquet(path, **dict_without_keys(file_options, "path"))
    elif file_type == "hdf":
        return dd.read_hdf(path, **dict_without_keys(file_options, "path"))
    elif file_type == "json":
        return dd.read_json(path, **dict_without_keys(file_options, "path"))
    elif file_type == "sql_table":
        return dd.read_sql_table(**file_options)
    elif file_type == "table":
        return dd.read_table(path, **dict_without_keys(file_options, "path"))
    elif file_type == "fwf":
        return dd.read_fwf(path, **dict_without_keys(file_options, "path"))
    elif file_type == "orc":
        return dd.read_orc(path, **dict_without_keys(file_options, "path"))
    else:
        raise DagsterInvariantViolationError(
            "Unsupported file_type {file_type}".format(file_type=file_type))
def dataframe_loader(_context, config):
    file_type, file_options = list(config.items())[0]
    path = file_options.get('path')

    if file_type == 'csv':
        return dd.read_csv(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'parquet':
        return dd.read_parquet(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'hdf':
        return dd.read_hdf(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'json':
        return dd.read_json(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'sql_table':
        return dd.read_sql_table(**file_options)
    elif file_type == 'table':
        return dd.read_table(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'fwf':
        return dd.read_fwf(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'orc':
        return dd.read_orc(path, **dict_without_keys(file_options, 'path'))
    else:
        raise DagsterInvariantViolationError(
            'Unsupported file_type {file_type}'.format(file_type=file_type))
import dask.dataframe as dd

approx_MB_per_chunk = 256
divisions = list('abcdefghijklmnopqrstuvwz')
bytes_per_chunk = approx_MB_per_chunk * 2 ** 20
print("Memory consumption will be around %.2f GB; %.0f MB per chunk on %.0f chunks." % (
    (approx_MB_per_chunk * len(divisions)) / 1024,
    approx_MB_per_chunk,
    len(divisions)
))

df = dd.read_sql_table(table=tbl_name,
                       uri=self.db.get_create_engine_string(),  # returns a SQLAlchemy connection URI string
                       index_col='symbol',
                       divisions=divisions,
                       columns=columns,
                       bytes_per_chunk=bytes_per_chunk)
print("Finished read_sql_table... (%s)" % tbl_name)

df = df.set_index('timestamp')  # set the index to make some operations fast
print("Finished set_index... (%s)" % 'timestamp')
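# Note on the snippet above: 'divisions' lists the index boundaries of each
# partition, so the list of letters splits the 'symbol' index alphabetically
# and must be given in sorted order. Re-indexing afterwards with
# set_index('timestamp') shuffles every partition, which can dominate the
# runtime for large tables.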
def fetch_dataframe():
    df = dd.read_sql_table("testdaten", 'sqlite:///Kundendaten.db', "index")
    # set_index returns a new dataframe, so the result has to be reassigned
    df = df.set_index(df.Datum)
    return df
import dash_core_components as dcc
import dash_html_components as html
import dash_bootstrap_components as dbc
from dash.dependencies import Input, Output
from app import app
import plotly.graph_objects as go
import dask.dataframe as dd
import plotly.express as px

# Import the test data from the customer database as a Dask dataframe
df = dd.read_sql_table("testdaten", 'sqlite:///Kundendaten.db', "index")
df = df.compute()

# Prepare the dataframe to derive the metrics "Kaufwahrscheinlichkeit in %"
# (purchase probability) and "Gewinn pro Verkauf in €" (profit per sale),
# which form the axes of the BCG matrix
df_BCG_1 = df.groupby(["Angebotenes Produkt"])[["Gewinn", "Anzahl"]].sum().reset_index()
df_BCG_2 = df.groupby(["Angebotenes Produkt"])["Anzahl"].apply(lambda x: x.sum() / x.count())
df_BCG = df_BCG_1.merge(df_BCG_2, on="Angebotenes Produkt")
df_BCG = df_BCG.rename(columns={
    "Anzahl_x": "Anzahl",
    "Anzahl_y": "Kaufwahrscheinlichkeit"
})
df_BCG["Kaufwahrscheinlichkeit in %"] = df_BCG["Kaufwahrscheinlichkeit"] * 100
df_BCG["Gewinn pro Verkauf in €"] = df_BCG["Gewinn"] / df_BCG["Anzahl"]

# Create a scatter plot of "Kaufwahrscheinlichkeit in %" vs. "Gewinn pro Verkauf in €" per product
fig = px.scatter(df_BCG, x=df_BCG["Kaufwahrscheinlichkeit in %"],
def read_raw_data(npartitions: int) -> dd.DataFrame:
    logger.info('Reading raw data into Dask')
    return dd.read_sql_table(
        'expunge',
        DATABASE_URI,
        index_col='id',
        npartitions=npartitions
    )
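# A minimal usage sketch for read_raw_data above, assuming DATABASE_URI points
# at a reachable database containing the 'expunge' table. The read itself is
# lazy; head() only materializes rows from the first partition.
if __name__ == '__main__':
    expunge_ddf = read_raw_data(npartitions=8)
    print(expunge_ddf.head())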
from revoscalepy.utils.RxUtils import rx_print_header
from revoscalepy import RxInSqlServer  # assumed import for the compute context used below
import dask.dataframe as dd
from config import CONNECTION_STRING
from urllib.parse import quote_plus
import os

####
# Read a SQL table into a dask dataframe (data chunked on disk)
####
print("Reading from SQL table into dask df...")
SQLALCHEMY_DATABASE_URI = "mssql+pyodbc:///?odbc_connect=%s" % quote_plus(CONNECTION_STRING)
df = dd.read_sql_table('Lead_Demography_Tbl_WithID',
                       SQLALCHEMY_DATABASE_URI,
                       index_col='Lead_Id')
print(df.columns)

####
# Write to a file (to_csv)
####

###
# Read a SQL table (in-place in the DB!) with revoscalepy
###
print("Reading from SQL table in-place with revoscalepy...")
compute_context = RxInSqlServer(connectionString=CONNECTION_STRING,
def _load(self):
    import dask.dataframe as dd
    self._dataframe = dd.read_sql_table(self._sql_expr, self._uri, self._index,
                                        **self._sql_kwargs)
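# A hedged usage sketch for the lazy loader above: the attribute names mirror
# the method, but the source object, URI, table and index values below are
# placeholders, not values from the original project.
# source._sql_expr = 'measurements'
# source._uri = 'postgresql://user:pass@host:5432/db'
# source._index = 'id'
# source._sql_kwargs = {'npartitions': 4}
# source._load()
# ddf = source._dataframe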
import os

import dask.dataframe as dd
from contexttimer import Timer
from docopt import docopt
from dask.distributed import Client, LocalCluster

if __name__ == "__main__":
    args = docopt(__doc__, version="Naval Fate 2.0")
    conn = os.environ["POSTGRES_URL"]
    table = os.environ["POSTGRES_TABLE"]
    npartition = int(args["<num>"])

    cluster = LocalCluster(n_workers=npartition, scheduler_port=0, memory_limit="230G")
    client = Client(cluster)

    with Timer() as timer:
        df = dd.read_sql_table(
            table,
            conn,
            "l_orderkey",
            npartitions=npartition,
            limits=(0, 60000000),
        ).compute()
    print(f"[Total] {timer.elapsed:.2f}s")
    print(df.head())
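# Note on the call above: limits=(0, 60000000) bounds the range of the
# 'l_orderkey' index that is read, and the partition boundaries are spread
# across that range. The trailing .compute() pulls the full result into a
# single pandas dataframe on the client, so memory has to cover the whole table.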
import dask
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
# from multiprocessing.pool import Pool
from multiprocessing.pool import ThreadPool

dask.config.set(scheduler='threads')
# dask.config.set(pool=Pool(5))
dask.config.set(pool=ThreadPool(5))

df: dd.DataFrame = dd.read_sql_table("products",
                                     "postgresql://tilak@localhost:5432/datamart",
                                     "region_code")
print(df)

with ProgressBar():
    df.to_parquet("/tmp/products")
    feature_json = spotify_audio_features(final_tracks)
    feature_df = parse_multiple_features(feature_json)
    merged_df = track_df.merge(feature_df, how="inner", left_on="track_id", right_on="id")
    dfs.append(merged_df)
    # Concatenate all the dataframes, ignore the index, and drop duplicates created in the last step
    return pd.concat(dfs, ignore_index=True).drop_duplicates(keep="first").reset_index(drop=True)

# %%
from dask.distributed import Client

db_path = "sqlite:///last_fm.db"
client = Client(n_workers=1, threads_per_worker=16, memory_limit="25GB", processes=False)
client

feature_ddf = dd.read_sql_table("FEATURES", db_path, "index", npartitions=10)
genre_track_ddf = dd.read_sql_table("TRAINING_DB_2", db_path, "index", npartitions=10)
genre_track_ddf = genre_track_ddf.loc[~genre_track_ddf['track_id'].isna()].set_index("track_id")

# %%
genre_feature_ddf = feature_ddf.merge(genre_track_ddf, on=func_dict.df_columns.track_id).set_index("track_id")

# %%
from dask_ml.preprocessing import Categorizer, DummyEncoder
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(
    Categorizer(),
    DummyEncoder()
        )
        raise ValueError(
            "A very specific bad thing happened, chase up the error tree to find out."
        )
    return sqlDF


def sqltoDF2(query, path, database, server):
    '''
    query = SQL file (query.SQL)
    path = location of SQL file
    database = database ()
    server = server ()
    example usage: sqltoDF(query='query.sql', path='../../src/', database='', server='')
    '''
    conn = create_engine(f'mssql+pyodbc://{database}/{server}?driver=SQL+Server')
    query = open(path + query)
    sqlDF = pd.read_sql_query(query.read(), conn)
    return sqlDF


# for dask, use a URI
server = ''
database = ''
username = ""
password = ""
params = urllib.parse.quote_plus("DRIVER={xxxx};SERVER=" + server + ";DATABASE=" + database)
uri = ("xxx+pyodbc:///?odbc_connect=%s" % params)
df = dd.read_sql_table("Value", uri=uri, npartitions=10, index_col="ID", schema='', head_rows=5)