Exemple #1
0
def drop_table(tbl_name: str, eng: sa.engine.Engine, dispose_eng=False):
    try:
        if eng.has_table(tbl_name):
            with eng.begin() as con:
                con.execute(f"DROP TABLE {tbl_name}")
    finally:
        if dispose_eng:
            eng.dispose()
Exemple #2
0
def merge_to_table(eng: sa.engine.Engine,
                   df: pd.DataFrame,
                   table_name: str,
                   compare_columns: List[str],
                   update_columns: List[str],
                   dtypes: dict,
                   temp_table_name: str = None,
                   dispose_eng=False):
    """
    Merges the dataframe into an existing table by creating a temp table with for the df and the merging it into the existing one.
    For rows with matching $compare columns we UPDATE the other values in $UPDATE_COLUMNS
    Args:
        eng: An engine object connecting the db
        df: The dataframe we want to insert to the DB
        table_name: The existing table's name
        compare_columns: The columns we want to compare existing rows with
        update_columns: The columns we want to update in case a matching row is found
        temp_table_name: optional, a name for the temp table for the DB
        dtypes: The data-types for each column in the DB
        dispose_eng: Whether to dispose of the engine after the read

    Returns:
        None
    """
    table_name = table_name.lower()  # fixes stuff for postgres

    if df.empty:
        return

    if dtypes is None:
        dtypes = {}
    if temp_table_name is None:
        temp_table_name = get_temp_table_name()
    if eng.dialect.name.lower() == "oracle" and (len(temp_table_name) > MAX_TABLE_NAME_ORACLE or \
                                                 len(table_name) > MAX_TABLE_NAME_ORACLE):
        raise Exception('table name is too long')

    if len(df) > 200_000:
        chunk_size = 100_000
        for i in tqdm(range(0, len(df), chunk_size),
                      desc=f"Merging into {table_name}",
                      unit="100_000 chunk"):
            df_chunk = df.iloc[i:min(len(df), i + chunk_size)]
            merge_to_table(eng,
                           df_chunk,
                           table_name,
                           compare_columns,
                           update_columns,
                           dtypes=dtypes)

    else:
        try:
            # logger = logging.getLogger()
            # logger.info(f"Writing {len(df)} rows to {table_name} table")
            if not eng.has_table(table_name):
                insert_into_table(eng, df, table_name, dtypes, compare_columns)
            else:
                if eng.dialect.name.lower() not in ("oracle", "postgresql"):
                    raise RuntimeError(
                        f"merge into does not work for {eng.dialect.name}")

                insert_into_table(eng, df, temp_table_name, dtypes,
                                  compare_columns)

                if eng.dialect.name.lower() == "oracle":
                    on_statment = "\nAND ".join(
                        [f"curr.{col} = tmp.{col}" for col in compare_columns])
                    set_statment = "\n,".join(
                        [f"curr.{col} = tmp.{col}" for col in update_columns])
                    all_columns = compare_columns + update_columns
                    all_columns_names = ",".join(all_columns)
                    all_columns_values = ",".join(
                        [f"tmp.{col}" for col in all_columns])
                    sql = f"""
                    merge into {table_name} curr
                    using (select {all_columns_names} from {temp_table_name}) tmp
                    on ({on_statment})
                    when matched then
                    update set {set_statment}
                    when not matched then
                    insert ({all_columns_names})
                    values ({all_columns_values})
                    """
                else:  # postgresql
                    set_statment = ",".join([
                        f"{col} = EXCLUDED.{col}" for col in update_columns
                    ])  # postgres syntax
                    all_columns = compare_columns + update_columns
                    all_columns_names = ",".join(all_columns)
                    from coord2vec.common.db.postgres import get_index_str_for_unique
                    on_statment = get_index_str_for_unique(
                        compare_columns, dtypes)

                    sql = f"""
                    INSERT INTO {table_name} ({all_columns_names})
                    SELECT {all_columns_names}
                    FROM {temp_table_name} tmp
                    
                    ON CONFLICT ({on_statment})
                    DO UPDATE SET {set_statment};
                    """  # can fail if no key is saved on the on_statement columns

                with eng.begin() as con:
                    con.execute(sql)
                    con.execute(f"drop table {temp_table_name}")
        finally:
            if eng.has_table(temp_table_name):
                with eng.begin() as con:
                    con.execute(f"drop table {temp_table_name}")

    if dispose_eng:
        eng.dispose()