Example #1
    def load_preprocess(self, database_directory):
        """Load the content and interaction tables from SQLite, merge them,
        and aggregate event strengths per person/content/title pair."""

        # dask's read_sql_table expects a database URI string plus an index
        # column, not a DBAPI connection object.
        uri = f"sqlite:///{database_directory}"

        # Assumed index columns (taken from the merge below); non-numeric
        # index columns would additionally need explicit divisions.
        self.articles_df = dd.read_sql_table('content_data', uri,
                                             index_col='content_id').reset_index()
        self.interactions_df = dd.read_sql_table('user_data', uri,
                                                 index_col='content_id').reset_index()

        df = self.articles_df.merge(self.interactions_df, on='content_id')
        df['eventStrength'] = df['event_type'].apply(
            lambda x: self.event_type_strength[x],
            meta=('eventStrength', 'float64'))

        df = df.drop_duplicates()
        df = df.groupby(['person_id', 'content_id',
                         'title']).sum().reset_index()

        # .cat.codes requires known categories, hence .cat.as_known()
        df['title'] = df['title'].astype('category').cat.as_known()
        df['person_id'] = df['person_id'].astype('category').cat.as_known()
        df['content_id'] = df['content_id'].astype('category').cat.as_known()
        df['personId'] = df['person_id'].cat.codes
        df['contentId'] = df['content_id'].cat.codes

        self.content_data = df

        # Load user wellness score data
        self.scores_data = dd.read_sql_table('scores_data', uri,
                                             index_col='person_id')  # assumed index column
Example #2
def baixa_enderecos_cnpj_Dask(bpergunta=True):
    #rodaSo1bloco = True
    print('STARTING baixa_enderecos_cnpj-------------------------', time.ctime())
    conBaseCompleta = sqlalchemy.create_engine(f"sqlite:///{camDbSqliteBaseCompleta}") #, execution_options={"sqlite_raw_colnames": True})
    #conEnderecoNormalizado = sqlalchemy.create_engine(f"sqlite:///{camDBSaida}", execution_options={"sqlite_raw_colnames": True})
    query = '''
                create table endereco_aux AS
                SELECT t.cnpj, cast(t.cnpj_basico as int) as cnpj_basico,
                situacao_cadastral as situacao,
                --tipo_logradouro, logradouro, numero, complemento, bairro,
                (logradouro || ' ' || numero || ' ' || complemento) as logradouroNumeroComplemento,
                ifnull(tm.descricao,'') as municipio, t.uf
                FROM estabelecimento t 
                left join municipio tm on tm.codigo=t.municipio
                limit 3000000
                
            ''' # there may be companies outside the test base
    print(time.ctime(), 'creating table endereco_aux')
    conBaseCompleta.execute('DROP TABLE IF EXISTS endereco_aux')    
    conBaseCompleta.execute(query)
    print(time.ctime(), 'creating table endereco_aux. Done.')
    #inicio = 0
    #kregistros = gstep
    #numeroDeRegistros = conBaseCompleta.execute('select count(*) from estabelecimento').fetchall()[0][0]
    conBaseCompleta = None

    print(time.ctime(), 'dask enderecos')
    pend = dd.read_sql_table('endereco_aux', f"sqlite:///{camDbSqliteBaseCompleta}", 
                             index_col='cnpj_basico')
    pend['endereco'] = pend['logradouroNumeroComplemento'].apply(normalizaEndereco, meta=('logradouroNumeroComplemento', 'object')) +  '-' + pend['municipio'] + '-'+pend['uf']
    dftmptable = pend[['cnpj','endereco','situacao']]
    dftmptable.to_sql('endereco', f"sqlite:///{camDBSaida}", if_exists='append', dtype=sqlalchemy.types.String)
    print('ROUTINE FINISHED ' + time.ctime())
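The listing above calls normalizaEndereco without showing it. A minimal sketch of what such a normalizer might do (hypothetical, not the original project's code): strip accents, uppercase, and collapse whitespace so equivalent addresses compare equal.

import re
import unicodedata

def normalizaEndereco(endereco):
    # hypothetical helper: the real implementation is not part of this listing
    if endereco is None:
        return ''
    # drop accents, e.g. 'Avenida São João' -> 'AVENIDA SAO JOAO'
    sem_acento = unicodedata.normalize('NFKD', str(endereco)).encode('ascii', 'ignore').decode('ascii')
    # uppercase and collapse repeated whitespace
    return re.sub(r'\s+', ' ', sem_acento.upper()).strip()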
Example #3
def extract(table_name, schema_name, column_names, index_col, id):

    try:

        # create empty dataframe as model for importing data from sql table to dask dataframe
        # (for meta argument in read_sql_table method)
        empty_df = pd.DataFrame(columns=column_names, dtype='object')
        empty_df[index_col] = pd.to_numeric(empty_df[index_col],
                                            errors='coerce')

        # get number of cores to set npartitions:
        ncores = psutil.cpu_count(logical=False)
        logger.warning('ncores used by Dask = %s', ncores)

        # set dask dataframe index
        index_dask = sqlalchemy.sql.column(index_col).label("gn_id")

        # get user table row data as a dask dataframe
        df = dd.read_sql_table(table=table_name,
                               index_col=index_dask,
                               uri=str(DB.engine.url),
                               schema=schema_name,
                               bytes_per_chunk=100000000)

        return df

    except Exception:
        raise
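A note on empty_df: it is prepared for read_sql_table's meta argument but never passed in this listing. A sketch of how it could be wired in, as a hypothetical variant (extract_with_meta is not the original function; it assumes column_names covers every column of the table, a numeric index_col, and that the caller supplies the SQLAlchemy URI):

import pandas as pd
import dask.dataframe as dd

def extract_with_meta(table_name, schema_name, column_names, index_col, uri):
    # passing meta lets Dask skip sampling head rows to infer dtypes;
    # it must match the frame read_sql_table will actually return
    empty_df = pd.DataFrame(columns=column_names, dtype='object')
    empty_df[index_col] = pd.to_numeric(empty_df[index_col], errors='coerce')
    return dd.read_sql_table(table=table_name,
                             uri=uri,
                             schema=schema_name,
                             index_col=index_col,
                             meta=empty_df.set_index(index_col),
                             bytes_per_chunk=100000000)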
Example #4
def read_db(path, table, index_col):
    """
    Args:
        path:
        table:
        index_col:
    """
    engine = sa.create_engine(path)
    conn = engine.connect()
    m = sa.MetaData()
    table = sa.Table(table, m, autoload=True, autoload_with=engine)

    # conn.execute("create table testtable (uid integer Primary Key, datetime NUM)")
    # conn.execute("insert into testtable values (1, '2017-08-03 01:11:31')")
    # print(conn.execute('PRAGMA table_info(testtable)').fetchall())
    # conn.close()

    uid, dt = list(table.columns)
    q = sa.select([dt.cast(sa.types.String)]).select_from(table)

    daskDF = dd.read_sql_table(table,
                               path,
                               index_col=index_col,
                               parse_dates={'datetime': '%Y-%m-%d %H:%M:%S'})
    return daskDF
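The select q built above is never executed. If the intent was to sanity-check the datetime column as text before the Dask read, it could be run through plain pandas inside read_db; a sketch (assumes pandas is imported as pd, which the listing does not show):

    # preview the cast column with pandas before handing the table to Dask
    preview = pd.read_sql(q, conn)
    print(preview.head())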
Example #5
def migrate_table_dask(table,
                       table_id,
                       input_schema,
                       output_schema,
                       uri_input,
                       uri_output,
                       npartitions,
                       bytes_per_chunk='256MB',
                       parallel=True,
                       if_exists='append',
                       method='multi'):
    df = dd.read_sql_table(table=table,
                           uri=uri_input,
                           schema=input_schema,
                           index_col=table_id,
                           npartitions=npartitions,
                           bytes_per_chunk=bytes_per_chunk)
    # df = df.drop(f'{table_id}__1', axis=1)
    dd.to_sql(df,
              uri=uri_output,
              name=table,
              schema=output_schema,
              if_exists=if_exists,
              parallel=parallel,
              method=method)
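A hypothetical invocation of migrate_table_dask; every value below (URIs, schema and table names, index column, partition count) is a placeholder, not something taken from the original project:

migrate_table_dask(table='events',
                   table_id='event_id',
                   input_schema='public',
                   output_schema='analytics',
                   uri_input='postgresql://user:password@source-host:5432/source_db',
                   uri_output='postgresql://user:password@target-host:5432/target_db',
                   npartitions=8)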
Example #6
    def __init__(self, config: dict = None, config_file_path: str = None):
        """
        Database Class constructor

        :param config: Dictionary Based configuration. Required keys are "host", "port", "database", "username", "password"
        :param config_file_path: Also supports file based configuration. The file should be in JSON format with the above keys.
        """
        try:
            if config is None:
                if config_file_path is None:
                    if 'EPYODBC_DBCONFIG' not in os.environ:
                        raise Exception(
                            "Server credentials missing! Set EPYODBC_DBCONFIG Environment variable or pass configs/config_file_path to the constructor"
                        )
                    else:
                        config_file_path = os.environ['EPYODBC_DBCONFIG']
                print(f"Loading server config from : {config_file_path}")
                config = json.load(open(config_file_path))
        except Exception as e:
            print(e)
            exit(1)

        assert "host" in config, f"host key missing in config file: {config_file_path}"
        assert "port" in config, f"port key missing in config file: {config_file_path}"
        assert "database" in config, f"database key missing in config file: {config_file_path}"
        assert "username" in config, f"username key missing in config file: {config_file_path}"
        assert "password" in config, f"password key missing in config file: {config_file_path}"

        self.host = config["host"]
        self.port = config["port"]
        self.database = config["database"]
        self.username = config["username"]
        self.password = config["password"]
        self.conn = self.connect()
        self.SKIP_TABLES = [
            'MSreplication_options', 'spt_fallback_db', 'spt_fallback_dev',
            'spt_fallback_usg', 'spt_monitor', 'trace_xe_action_map',
            'trace_xe_event_map'
        ]
        self.tables = []
        self.index_cols = {}
        for row in self.conn.cursor().tables(tableType='TABLE'):
            if row.table_name not in self.SKIP_TABLES:
                self.tables.append(row.table_name)
        for table in self.tables:
            self.index_cols[table] = list(
                self.conn.cursor().primaryKeys(table))[0][3]
        for table in self.tables:
            self.__setattr__(
                f"{table}_",
                dd.read_sql_table(
                    table=f"{table}",
                    uri=
                    f'mssql+pyodbc://{config["username"]}:{config["password"]}@{config["host"]}:{config["port"]}/{config["database"]}?DRIVER={{ODBC Driver 17 for SQL Server}};',
                    index_col=self.index_cols[table]))
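A sketch of a more defensive way to build the same kind of connection string (the quote_plus pattern also appears in later examples on this page): URL-encoding the credentials keeps the URI valid when the password contains characters such as '@', '/' or ';'. build_mssql_uri is a hypothetical helper, not part of the class above.

from urllib.parse import quote_plus

def build_mssql_uri(config):
    # URL-encode the credentials so special characters cannot break the URI
    return (
        "mssql+pyodbc://"
        f"{quote_plus(config['username'])}:{quote_plus(config['password'])}"
        f"@{config['host']}:{config['port']}/{config['database']}"
        "?driver=ODBC+Driver+17+for+SQL+Server"
    )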
Example #7
    def upgrade_rtd(self):
        """
        Pull data from the database that is not yet in the local cache.
        This function seems to work but is not properly tested.
        """
        rtd = self.load_data()
        len_beginning = len(rtd)
        print('Rows before update:', len_beginning)
        max_date = rtd['ar_pt'].max().compute() - datetime.timedelta(days=2)
        max_date = max_date.to_pydatetime()
        print('getting data added since', max_date)

        from sqlalchemy import Column, DateTime
        from sqlalchemy import sql
        from sqlalchemy.dialects import postgresql

        with get_engine().connect() as connection:
            query = sql.select([Column(c) for c in self.df_dict] + [Column('hash_id')])\
                .where((Column('ar_pt', DateTime) > str(max_date)) | (Column('dp_pt', DateTime) > str(max_date)))\
                .select_from(sql.table(Rtd.__tablename__))\
                .alias('new_rtd')
            view_query = 'CREATE OR REPLACE VIEW new_rtd AS {}'\
                         .format(str(query.compile(dialect=postgresql.dialect(),
                                                   compile_kwargs={"literal_binds": True})))
            connection.execute(view_query)
            new_rtd = dd.read_sql_table('new_rtd', DB_CONNECT_STRING,
                                        index_col='hash_id', meta=self.meta, npartitions=20)

            new_rtd.to_parquet(self.DATA_CACHE_PATH + '_new', engine='pyarrow', schema='infer') 
        new_rtd = dd.read_parquet(self.DATA_CACHE_PATH + '_new', engine='pyarrow')

        new_rtd = self._parse(new_rtd)
        
        new_rtd.to_parquet(self.DATA_CACHE_PATH + '_new', engine='pyarrow', schema='infer')
        new_rtd = dd.read_parquet(self.DATA_CACHE_PATH + '_new', engine='pyarrow')

        
        # Remove changes from rtd that are also present in new_rtd
        rtd = rtd.loc[~rtd.index.isin(new_rtd.index.compute()), :]

        rtd = dd.concat([rtd, new_rtd], axis=0, ignore_index=False)
        
        # We need to recategorize here, as the categories might grow from int8 to int16
        # and then they need to be recalculated.
        rtd = self._categorize(rtd)
        rtd.to_parquet(self.DATA_CACHE_PATH, engine='pyarrow', schema='infer')


        rtd = self.load_data()
        self._save_encoders(rtd)

        len_end = len(rtd)
        print('Rows after getting new data:', len_end)
        print('Got', len_end - len_beginning, 'new rows')
        print('Number of duplicate indices:', rtd.index.compute().duplicated(keep='last').sum())
Example #8
def fetch_dataframe(ytd):
    # Read in the data
    df = dd.read_sql_table("testdaten", 'sqlite:///Kundendaten.db', "Jahr")

    # Reduce the data to the requested year
    try:
        df_YTD = df.loc[ytd].compute()
    except KeyError:
        df_YTD = df.loc[ytd - 1].compute()
        print(f"No data available for year {ytd}. Loading previous year's data...")
    return df_YTD
Example #9
def read():

    database = 'sqlite:///registered_voters.sqlite'

    registered08 = dd.read_sql_table('ALL', database, index_col='index')
    registered08.columns = registered08.columns.str.lower()

    registered08['year'] = 2008
    registered08 = registered08.rename(columns={
        'voters': 'Total Voters'
    }).drop('party', axis='columns')

    return registered08
Example #10
    def download_rtd(self):
        """
        Pull the Rtd.__tablename__ table from db, parse it and save it on disk.
        """
        with ProgressBar():
            rtd = dd.read_sql_table(self.__tablename__, DB_CONNECT_STRING,
                                    index_col='hash_id', meta=self.meta, npartitions=200)
            rtd.to_parquet(self.DATA_CACHE_PATH, engine='pyarrow', schema='infer') # write_metadata_file=False)
            rtd = dd.read_parquet(self.DATA_CACHE_PATH, engine='pyarrow')

            rtd = self._parse(rtd)
            self._save_encoders(rtd)

            # Save data to parquet. We have to use pyarrow as fastparquet does not support pd.Int64
            rtd.to_parquet(self.DATA_CACHE_PATH, engine='pyarrow', schema='infer')
Example #11
def send_to_s3bucket(file_name,
                     bucket_path,
                     table,
                     table_id,
                     input_schema,
                     uri_input,
                     npartitions,
                     bytes_per_chunk='256MB'):
    df = dd.read_sql_table(table=table,
                           uri=uri_input,
                           schema=input_schema,
                           index_col=table_id,
                           npartitions=npartitions,
                           bytes_per_chunk=bytes_per_chunk)
    # df = df.drop(f'{table_id}__1', axis=1)
    df.to_csv(bucket_path + file_name)
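A hypothetical variant that writes straight to an S3 prefix instead of a generic path; the function name, bucket prefix and credential handling are placeholders. The '*' in the file name is replaced by the partition number, and storage_options is forwarded by Dask to the filesystem backend (s3fs).

import dask.dataframe as dd

def send_to_s3bucket_v2(bucket_prefix, table, table_id, input_schema, uri_input,
                        npartitions, bytes_per_chunk='256MB', storage_options=None):
    df = dd.read_sql_table(table=table,
                           uri=uri_input,
                           schema=input_schema,
                           index_col=table_id,
                           npartitions=npartitions,
                           bytes_per_chunk=bytes_per_chunk)
    # one CSV file per partition, e.g. s3://bucket/prefix/events-part-0.csv
    df.to_csv(f"{bucket_prefix}{table}-part-*.csv",
              storage_options=storage_options)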
Example #12
def extract(table_name, schema_name, column_names, index_col, id):

    # create empty dataframe as model for importing data from sql table to dask dataframe
    # (for meta argument in read_sql_table method)
    empty_df = pd.DataFrame(columns=column_names, dtype="object")
    empty_df[index_col] = pd.to_numeric(empty_df[index_col], errors="coerce")

    # set dask dataframe index
    index_dask = sqlalchemy.sql.column(index_col).label("gn_id")
    query = """
    ALTER TABLE {schema_name}.{table_name}
    ALTER {index_col} TYPE integer
    USING {index_col}::integer;
    """
    query_nb_row = """
    SELECT count(*) 
    FROM {schema_name}.{table_name}
    """.format(schema_name=schema_name, table_name=table_name)
    npartition = 1  # fallback in case the row-count query below fails
    try:
        DB.session.execute(
            query.format(schema_name=schema_name,
                         table_name=table_name,
                         index_col=index_col))
        DB.session.commit()

        query = DB.session.execute(query_nb_row).fetchone()
        nb_row = query[0]
        npartition = 1 if nb_row < 50000 else 2
    except Exception as e:
        DB.session.rollback()

    # get user table row data as a dask dataframe
    df = dd.read_sql_table(
        table=table_name,
        index_col=index_dask,
        uri=str(DB.engine.url),
        schema=schema_name,
        # bytes_per_chunk=100000000,
        npartitions=npartition,
    )

    return df
Example #13
    def out_of_core_execute(self,
                            timestamp_column,
                            f,
                            blocksize=268435456,
                            **kwargs):

        query = f(**kwargs)
        timestamps = [[str(date_range[1]),
                       str(date_range[2])]
                      for date_range in self.__get_time_splits(query, 1)][0]
        columns = re.findall(self.col_pattern, query)[0].replace(' ',
                                                                 '').split(',')
        table = re.findall(self.table_pattern, query)[0].split('.')[1]
        df = dd.read_sql_table(table=table,
                               divisions=timestamps,
                               uri=self.uri,
                               columns=columns,
                               index_col=timestamp_column,
                               bytes_per_chunk=blocksize)

        return df
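The col_pattern and table_pattern attributes are not shown in this listing; hypothetical patterns along these lines would make the extraction above work for a query such as "SELECT ts, price FROM market.trades WHERE ...":

# hypothetical definitions, not the original class attributes
col_pattern = r'SELECT\s+(.+?)\s+FROM'    # findall captures 'ts, price'
table_pattern = r'FROM\s+([\w\.]+)'       # findall captures 'market.trades'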
Example #14
def read_tables(engine, table_names):
    """
    Get pandas dataframes for all tables in table_names
    """
    logging.debug("reading SQL tables %s", table_names)
    all_tables = {}
    if USE_DASK:
        for name in table_names:
            class_ = get_table_name_from_class(name)
            p_keys = [key.name for key in inspect(class_).primary_key][0]
            table = dd.read_sql_table(table=name,
                                      uri=engine.url,
                                      index_col=p_keys)
            all_tables[name] = table
            logging.debug("df %s has columns %s", name, table.columns)
    else:
        for name in table_names:
            table = pd.read_sql_table(name, engine)
            all_tables[name] = table
            logging.debug("df %s has columns %s", name, table.columns)

    return all_tables
Example #15
def dataframe_loader(_context, config):
    file_type, file_options = list(config.items())[0]
    path = file_options.get("path")

    if file_type == "csv":
        return dd.read_csv(path, **dict_without_keys(file_options, "path"))
    elif file_type == "parquet":
        return dd.read_parquet(path, **dict_without_keys(file_options, "path"))
    elif file_type == "hdf":
        return dd.read_hdf(path, **dict_without_keys(file_options, "path"))
    elif file_type == "json":
        return dd.read_json(path, **dict_without_keys(file_options, "path"))
    elif file_type == "sql_table":
        return dd.read_sql_table(**file_options)
    elif file_type == "table":
        return dd.read_table(path, **dict_without_keys(file_options, "path"))
    elif file_type == "fwf":
        return dd.read_fwf(path, **dict_without_keys(file_options, "path"))
    elif file_type == "orc":
        return dd.read_orc(path, **dict_without_keys(file_options, "path"))
    else:
        raise DagsterInvariantViolationError(
            "Unsupported file_type {file_type}".format(file_type=file_type))
Example #16
def dataframe_loader(_context, config):
    file_type, file_options = list(config.items())[0]
    path = file_options.get('path')

    if file_type == 'csv':
        return dd.read_csv(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'parquet':
        return dd.read_parquet(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'hdf':
        return dd.read_hdf(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'json':
        return dd.read_json(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'sql_table':
        return dd.read_sql_table(**file_options)
    elif file_type == 'table':
        return dd.read_table(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'fwf':
        return dd.read_fwf(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'orc':
        return dd.read_orc(path, **dict_without_keys(file_options, 'path'))
    else:
        raise DagsterInvariantViolationError(
            'Unsupported file_type {file_type}'.format(file_type=file_type))
Example #17
import dask.dataframe as dd


        approx_MB_per_chunk = 256
        divisions = list('abcdefghijklmnopqrstuvwz')

        bytes_per_chunk = approx_MB_per_chunk * 2 ** 20
        print("Memory consumptions will be around %.2f GB; %.0f MB per chunk on %.0f chunks." % (
            (approx_MB_per_chunk * len(divisions))/1024, approx_MB_per_chunk, len(divisions)
        ))
        df = dd.read_sql_table(table=tbl_name,
                               uri=self.db.get_create_engine_string(), # This returns an sqlalchemy create engine scheme
                               index_col='symbol',
                               divisions=divisions,
                               columns=columns,
                               bytes_per_chunk=bytes_per_chunk)

        print("Finished read_sql_table... (%s)" % tbl_name)
        df = df.set_index('timestamp')  # set the index to make some operations fast
        print("Finished set_index... (%s)" % 'timestamp')
Example #18
def fetch_dataframe():
    df = dd.read_sql_table("testdaten", 'sqlite:///Kundendaten.db', "index")
    # set_index returns a new dataframe, so the result has to be assigned
    df = df.set_index(df.Datum)
    return df
Example #19
import dash_core_components as dcc
import dash_html_components as html
import dash_bootstrap_components as dbc
from dash.dependencies import Input, Output
from app import app
import plotly.graph_objects as go
import dask.dataframe as dd
import plotly.express as px

# Import the test data from the customer database as a Dask DataFrame
df = dd.read_sql_table("testdaten", 'sqlite:///Kundendaten.db', "index")
df = df.compute()

# Prepare the dataframe to derive the metrics "Kaufwahrscheinlichkeit in %" (purchase probability)
# and "Gewinn pro Verkauf in €" (profit per sale), which form the axes of the BCG matrix
df_BCG_1 = df.groupby(["Angebotenes Produkt"])[["Gewinn",
                                                "Anzahl"]].sum().reset_index()
df_BCG_2 = df.groupby(["Angebotenes Produkt"
                       ])["Anzahl"].apply(lambda x: x.sum() / x.count())

df_BCG = df_BCG_1.merge(df_BCG_2, on="Angebotenes Produkt")
df_BCG = df_BCG.rename(columns={
    "Anzahl_x": "Anzahl",
    "Anzahl_y": "Kaufwahrscheinlichkeit"
})
df_BCG["Kaufwahrscheinlichkeit in %"] = df_BCG["Kaufwahrscheinlichkeit"] * 100
df_BCG["Gewinn pro Verkauf in €"] = df_BCG["Gewinn"] / df_BCG["Anzahl"]

# Create a scatter plot of "Kaufwahrscheinlichkeit in %" vs. "Gewinn pro Verkauf in €" per product
fig = px.scatter(df_BCG,
                 x=df_BCG["Kaufwahrscheinlichkeit in %"],
Example #20
def read_raw_data(npartitions: int) -> dd.DataFrame:
    logger.info('Reading raw data into Dask')
    return dd.read_sql_table('expunge',
                             DATABASE_URI,
                             index_col='id',
                             npartitions=npartitions)
Example #21
from revoscalepy import RxInSqlServer
from revoscalepy.utils.RxUtils import rx_print_header
import dask.dataframe as dd
from config import CONNECTION_STRING
from urllib.parse import quote_plus
import os

####
# Read a SQL table into a dask dataframe (data chunked on disk)
####

print("Reading from SQL table into dask df...")

SQLALCHEMY_DATABASE_URI = "mssql+pyodbc:///?odbc_connect=%s" % quote_plus(
    CONNECTION_STRING)
df = dd.read_sql_table('Lead_Demography_Tbl_WithID',
                       SQLALCHEMY_DATABASE_URI,
                       index_col='Lead_Id')

print(df.columns)

####
# Write to a file (to_csv)
####

###
# Read a SQL table (in-place in the DB!) with revoscalepy
###

print("Reading from SQL table in-place with revoscalepy...")

compute_context = RxInSqlServer(connectionString=CONNECTION_STRING,
Example #22
    def _load(self):
        import dask.dataframe as dd
        self._dataframe = dd.read_sql_table(self._sql_expr, self._uri,
                                            self._index, **self._sql_kwargs)
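The _load method above comes from an intake-style data source; the attributes it reads are set elsewhere in that class. A minimal self-contained stand-in (hypothetical class and attribute names, not the original source) would be:

import dask.dataframe as dd

class SQLTableSource:
    def __init__(self, uri, table, index_col, **sql_kwargs):
        self._uri = uri
        self._sql_expr = table
        self._index = index_col
        self._sql_kwargs = sql_kwargs
        self._dataframe = None

    def _load(self):
        # lazily materialise the table as a Dask dataframe
        self._dataframe = dd.read_sql_table(self._sql_expr, self._uri,
                                            self._index, **self._sql_kwargs)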
Example #23
import os

import dask.dataframe as dd
from contexttimer import Timer
from docopt import docopt
from dask.distributed import Client, LocalCluster

if __name__ == "__main__":
    args = docopt(__doc__, version="Naval Fate 2.0")
    conn = os.environ["POSTGRES_URL"]
    table = os.environ["POSTGRES_TABLE"]
    npartition = int(args["<num>"])
    cluster = LocalCluster(n_workers=npartition,
                           scheduler_port=0,
                           memory_limit="230G")
    client = Client(cluster)

    with Timer() as timer:
        df = dd.read_sql_table(
            table,
            conn,
            "l_orderkey",
            npartitions=npartition,
            limits=(0, 60000000),
        ).compute()

    print(f"[Total] {timer.elapsed:.2f}s")

    print(df.head())
Example #24
import dask
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
# from multiprocessing.pool import Pool
from multiprocessing.pool import ThreadPool

dask.config.set(scheduler='threads')
# dask.config.set(pool=Pool(5))
dask.config.set(pool=ThreadPool(5))
df: dd.DataFrame = dd.read_sql_table("products", "postgresql://tilak@localhost:5432/datamart", "region_code")
print(df)
with ProgressBar():
    df.to_parquet("/tmp/products")
Example #25
    feature_json        = spotify_audio_features(final_tracks)
    feature_df          = parse_multiple_features(feature_json)

    merged_df           = track_df.merge(feature_df, how="inner", left_on="track_id", right_on="id")
    dfs.append(merged_df)

    #Concatenate all the dataframes, ignore the index, and drop duplicates created in the last step
    return pd.concat(dfs, ignore_index=True).drop_duplicates(keep="first").reset_index(drop=True)


# %%
from dask.distributed import Client
db_path = "sqlite:///last_fm.db"
client = Client(n_workers = 1, threads_per_worker=16, memory_limit="25GB", processes=False)
client
feature_ddf = dd.read_sql_table("FEATURES", db_path, "index", npartitions=10)
genre_track_ddf = dd.read_sql_table("TRAINING_DB_2", db_path, "index", npartitions=10)
genre_track_ddf = genre_track_ddf.loc[~genre_track_ddf['track_id'].isna()].set_index("track_id")


# %%
genre_feature_ddf = feature_ddf.merge(genre_track_ddf, on=func_dict.df_columns.track_id).set_index("track_id")


# %%
from dask_ml.preprocessing import Categorizer, DummyEncoder
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(
    Categorizer(),
    DummyEncoder()
Example #26
        )
        raise ValueError(
            "A very specific bad thing happened, chase up the error tree to find out."
        )

    return sqlDF
   
def sqltoDF2(query, path, database, server):
    '''
    query = SQL file (query.SQL)
    path = location of SQL file
    database = database ()
    server = server ()

    example usage
    sqltoDF(query='query.sql',path='../../src/', database = '', server='')
    '''
    conn = create_engine(f'mssql+pyodbc://{database}/{server}?driver=SQL+Server')
    query = open(path + query)
    sqlDF = pd.read_sql_query(query.read(), conn)
    return sqlDF

# for Dask, use a URI string
server = ''
database = ''
username = ""
password = ""

params = urllib.parse.quote_plus("DRIVER={xxxx};SERVER=" + server + ";DATABASE=" + database)
uri=("xxx+pyodbc:///?odbc_connect=%s" % params)
df = dd.read_sql_table("Value", uri=uri, npartitions=10, index_col="ID", schema='', head_rows=5)