def table_create(df: pd.DataFrame, tablename: str, conn: str = None):
    """
    pulls all fields from the dataframe and constructs a postgres table
    schema; using that schema, creates the new table in postgres.
    """
    table_fields = {}
    try:
        for col in df.columns:
            if tablename != 'aero_runs':
                table_fields[col] = type_translate[df.dtypes[col].name]
            else:
                table_fields[col] = aero_translate[df.dtypes[col].name]
        if table_fields:
            comm = sql_command(table_fields, tablename, conn) if conn != 'nri' \
                else sql_command(table_fields, tablename, 'nritest')
            d = db(conn)
            con = d.str
            cur = con.cursor()
            cur.execute(comm)
            con.commit()
    except Exception as e:
        print(e)
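# Usage sketch for table_create (hypothetical column names and values; assumes
# the module-level type_translate mapping and a configured "met" connection):
#
#   df = pd.DataFrame({"TIMESTAMP": pd.to_datetime(["2020-01-01 00:15:00"]),
#                      "ProjectKey": ["JORN"], "AirTemp": [21.4]})
#   table_create(df, "raw_met_data", "met")  # emits CREATE TABLE via sql_command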
def quick_ingest(whichset):
    internal_dict = {}
    selectedset = whichset
    count = 1
    # for current data -- dat files
    if isinstance(whichset['akron'], str):
        for i in selectedset.items():
            # i is a (projectname, filename) pair; skip the Bellevue table file
            if "BellevueTable1" not in i[1]:
                print("processing: ", i)
                fullpath = os.path.join(files_path, i[1])
                projk = projectkey_extractor(fullpath)
                inst = datScraper(fullpath)
                df = inst.getdf()
                df['ProjectKey'] = projk
                df = second_round(df)
                if df.TIMESTAMP.isnull().any():
                    df = df.loc[df.TIMESTAMP.notnull()]
                internal_dict.update({f"df{count}": df})
                count += 1
    # for historic data -- csv files
    elif isinstance(whichset['akron'], list):
        for i in selectedset.items():
            for j in i[1]:
                print("processing: ", j)
                fullpath = os.path.join(files_path, j)
                projk = projectkey_extractor(fullpath)
                inst = datScraper(fullpath)
                df = inst.getdf()
                df['ProjectKey'] = name_in_pg[projk]
                df = second_round(df)
                if df.TIMESTAMP.isnull().any():
                    df = df.loc[df.TIMESTAMP.notnull()]
                internal_dict.update({f"df{count}": df})
                count += 1
    finaldf = pd.concat(internal_dict.values())
    if not tablecheck("raw_met_data", "met"):
        table_create(finaldf, "raw_met_data", "met")
    try:
        d = db("met")
        ingesterv2.main_ingest(finaldf, "raw_met_data", d.str, 100000)
    except Exception as e:
        print(e)
def rowDeleter(tablename: str, dev=False, **fields: str):
    # where to delete from: dev or public
    print(f"number of key-value pairs: {len(fields)}")
    if not dev:
        d = db('dima')
        keyword = "public"
    else:
        d = db("dimadev")
        keyword = "dimadev"
    # creating connection/cursor object (for dev or public)
    con = d.str
    cur = con.cursor()
    if len(fields) == 0:
        print("no key-value pairs supplied; nothing to delete")
        return
    # how to structure the SQL depending on the number of key-value pairs
    if len(fields) == 1:
        print(f"deleting 1 key-value pair from postgres.{keyword}.{tablename}")
        for key, value in fields.items():
            singleField = f'''where "{key}"='{value}';'''
            printOut = f' "{key}" = \'{value}\''
        try:
            sqlStart = f'DELETE FROM postgres.{keyword}."{tablename}" {singleField}'
            print(f"Removing {printOut}...")
            print(f"Using SQL verb : {sqlStart}")
            cur.execute(sqlStart)
            print("Done.")
            con.commit()
        except Exception as e:
            print(e)
    else:
        print(f"deleting {len(fields)} key-value pairs from postgres.{keyword}.{tablename}")
        endVerb = keyvalAdder(fields)
        try:
            sqlStart = f'DELETE FROM postgres.{keyword}."{tablename}" {endVerb};'
            print("Removing rows using various key-value pairs...")
            print(f"Using SQL verb : {sqlStart}")
            cur.execute(sqlStart)
            print("Done.")
            con.commit()
        except Exception as e:
            print(e)
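# Usage sketch for rowDeleter (hypothetical table and key values; the fields
# arrive as keyword arguments and are interpolated into the DELETE statement):
#
#   rowDeleter("tblPlots", PlotKey="XYZ")                      # single pair
#   rowDeleter("tblLines", dev=True, DBKey="JER2019", LineKey="A1")  # multiple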
def sql_command_daterange(date_key_tuple):
    date1, date2, pk = date_key_tuple
    # note: `qry` replaces a local variable that shadowed the builtin `str`
    qry = f"""
    SELECT EXISTS(
        SELECT * FROM public.raw_met_data
        WHERE (
            "TIMESTAMP" >= '{date1}'::timestamp AND
            "TIMESTAMP" < '{date2}'::timestamp
        ) AND "ProjectKey" = '{pk}'
    );
    """
    try:
        d = db('met')
        con = d.str
        cur = con.cursor()
        cur.execute(qry)
        return cur.fetchone()[0]
    except Exception as e:
        print(e)
        return False
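# Usage sketch (hypothetical dates and project key): True if any raw_met_data
# rows fall inside the half-open interval [date1, date2) for the project:
#
#   sql_command_daterange(("2020-01-01", "2020-02-01", "JORN"))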
def single_pg_send(df, tablename):
    """
    uses: new_tablename, tablecheck, ingesterv2.main_ingest,
    table_create, datetime_type_assert
    """
    d = db('dima')
    print("STARTING INGEST")
    df = blank_fixer(df)
    df = significant_digits_fix_pandas(df)
    df = datetime_type_assert(df)
    if "openingSize" in df.columns:
        df = openingsize_fixer(df)
    if 'ItemType' in df.columns:
        newtablename = new_tablename(df)
        if tablecheck(newtablename):
            print(f'network table "{newtablename}" exists, ingesting..')
            ingesterv2.main_ingest(df, newtablename, d.str, 10000)
        else:
            table_create(df, newtablename, 'dima')
            print(f'created network table: {newtablename}')
            ingesterv2.main_ingest(df, newtablename, d.str, 10000)
    else:
        print("not a network table")
        newtablename = tablename
        if tablecheck(tablename):
            print("FOUND THE TABLE IN PG")
            ingesterv2.main_ingest(df, newtablename, d.str, 10000)
        else:
            print("DID NOT FIND TABLE IN PG, CREATING...")
            table_create(df, tablename, 'dima')
            ingesterv2.main_ingest(df, newtablename, d.str, 10000)
def has_duplicate_pks(df, tablename):
    d = db("dima")
    try:
        con = d.str
        cur = con.cursor()
        exists_query = f'''
        select "PrimaryKey" from postgres.public."{tablename}"
        '''
        cur.execute(exists_query)
        # fetchall returns 1-tuples; unpack them into a set of key strings
        pg_keys = {row[0] for row in cur.fetchall()}
        if 'PrimaryKey' in df.columns:
            # True if any key in the dataframe already exists in postgres
            return any(key in pg_keys for key in df.PrimaryKey)
        print('dataframe has no PrimaryKey column; aborting check')
    except Exception as e:
        print(e)
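# Usage sketch (hypothetical table): guard an ingest against re-inserting rows:
#
#   if not has_duplicate_pks(df, "tblLines"):
#       ingesterv2.main_ingest(df, "tblLines", db("dima").str, 10000)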
def tall_ingest(csv_path, table):
    d = db('geo')
    model_choice = {
        "dataheader": [dataHeader, 'dataHeader'],
        "datagap": [dataGap, 'dataGap'],
        "datalpi": [dataLPI, 'dataLPI'],
        "dataheight": [dataHeight, 'dataHeight'],
        "datasoilstability": [dataSoilStability, 'dataSoilStability'],
        "dataspeciesinventory": [dataSpeciesInventory, 'dataSpeciesInventory'],
        "geoindicators": [geoindicators, 'geoIndicators'],
        "geospecies": [geoSpecies, 'geoSpecies']
    }
    m = model_handler(csv_path, model_choice[table][0], model_choice[table][1])
    sans_null = m.checked_df.loc[m.checked_df.PrimaryKey.notnull()].copy()
    drops = [c for c in sans_null.columns if c in fields_to_drop[table.lower()]]
    sans_null.drop(columns=drops, inplace=True)
    m.create_empty_table(d.str)
    ing = ingesterv2(d.str)
    ingesterv2.main_ingest(sans_null, model_choice[table][1], d.str, 100000)
    if model_choice[table][1] in missing_pks:
        for pk in missing_pks[model_choice[table][1]]:
            ing.drop_rows(d.str, model_choice[table][1], "PrimaryKey", pk)
    if "header" not in table:
        ing.reestablish_fk(model_choice[table][1])
def pg2csvExport(tablename=None, dev=False, dbkey=None):
    path = os.getcwd()
    # where to export from: dev or public
    if not dev:
        keyword = "public"
        conn = "dima"
    else:
        keyword = "dimadev"
        conn = "dimadev"
    engine = engine_conn_string(conn)
    # tblPlots rows are keyed by ProjectKey; everything else by DBKey
    nopk = "ProjectKey" if (tablename is not None and "tblPlots" in tablename) else "DBKey"
    if tablename is None:
        if dbkey is not None:
            print("csv all tables with x projectkey; not implemented")
        else:
            print("no table, no projectkey; not implemented")
        return
    if tablecheck(tablename, conn=conn):
        if dbkey is not None:
            # both table and dbkey
            print(f"fetching pgdata that includes: {tablename}, {dbkey}...")
            query = f'SELECT * from postgres.{keyword}."{tablename}" where "{nopk}" = \'{dbkey}\''
            df = pd.read_sql(query, con=engine)
            csv_fieldcheck(df, path, tablename)
            print("done.")
        else:
            # only table
            print(f"fetching pgdata that includes: {tablename}...")
            query = f'SELECT * from postgres.{keyword}."{tablename}"'
            df = pd.read_sql(query, con=engine)
            csv_fieldcheck(df, path, tablename)
            print("done.")
    else:
        print(f"{tablename} does not exist in {conn} database")
def tablecheck(tablename, conn="dima"):
    """
    receives a tablename and returns True if the table exists in the
    postgres table schema, else returns False
    """
    tableschema = "dimadev" if conn == "dimadev" else "public"
    try:
        d = db(conn)
        con = d.str
        cur = con.cursor()
        cur.execute(
            "select exists(select * from information_schema.tables "
            "where table_name=%s and table_schema=%s)",
            (tablename, tableschema))
        return cur.fetchone()[0]
    except Exception as e:
        print(e)
        return False
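# Usage sketch: create-if-missing before an ingest, the same pattern
# quick_ingest uses above (hypothetical table name):
#
#   if not tablecheck("raw_met_data", conn="met"):
#       table_create(df, "raw_met_data", "met")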
def row_check(df):
    d = db("met")
    for i in range(len(df)):
        row = df.iloc[i:i + 1]
        ts = row.TIMESTAMP.values[0]
        pk = row.ProjectKey.values[0]
        try:
            if pg_check(ts, pk):
                print(f'timestamp:"{ts}" and projectkey: "{pk}" already in database, moving on..')
            else:
                print(f'ingesting timestamp:"{ts}" and projectkey: "{pk}"')
                ingesterv2.main_ingest(row, "raw_met_data", d.str, 10000)
        except Exception as e:
            print(e)
def add_modelrunkey_to_pg():
    d = db("aero")
    add_query = '''
    ALTER TABLE IF EXISTS "ModelRuns"
    ADD COLUMN "ModelRunKey" TEXT;
    '''
    try:
        con = d.str
        cur = con.cursor()
        cur.execute(add_query)
        con.commit()
    except Exception as e:
        print(e, 'error adding column to modelruns table')
def pull_max_date(projectkey):
    qry = """
    SELECT MAX("TIMESTAMP")
    FROM public.raw_met_data
    WHERE "ProjectKey" = %s;
    """
    try:
        d = db("met")
        con = d.str
        cur = con.cursor()
        cur.execute(qry, (projectkey,))
        return cur.fetchone()[0]
    except Exception as e:
        print(e)
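# Usage sketch: resume an incremental met ingest from the last stored reading
# (hypothetical project key and dataframe):
#
#   last_seen = pull_max_date("JORN")
#   new_rows = df.loc[df.TIMESTAMP > last_seen] if last_seen else df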
def drop_dbkey(table, path):
    squished_path = os.path.split(os.path.splitext(path)[0])[1].replace(" ", "")
    d = db('dima')
    try:
        con = d.str
        cur = con.cursor()
        # compose the identifier safely and bind the DBKey as a parameter
        cur.execute(
            sql.SQL('DELETE FROM postgres.public.{} WHERE "DBKey" = %s;').format(
                sql.Identifier(table)),
            (squished_path,))
        con.commit()
        print(f"successfully dropped '{squished_path}' from table '{table}'")
    except Exception as e:
        print(e)
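# Usage sketch: drop_dbkey derives the DBKey from the source file path, so a
# re-ingest can first clear a DIMA's prior rows (hypothetical path):
#
#   drop_dbkey("tblLines", "c://Folder/dima file.mdb")  # deletes DBKey 'dimafile'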
def schemaCreate(path, tablename):
    """
    ingesting tables with less priority: schemaTable
    """
    d = db("geo")
    df = pd.read_excel(path)
    if tablecheck(tablename, conn="geo"):
        print("table exists; ingesting...")
        ingesterv2.main_ingest(df, tablename, d.str)
        print("done")
    else:
        print("table does not exist; creating table..")
        table_create(df, tablename, conn="geo")
        print("ingesting...")
        ingesterv2.main_ingest(df, tablename, d.str)
        print("done")
def batch_looper(dimacontainer, pg=False):
    """
    creates an exhaustive list of tables across all dimas in a folder and
    then uses looper to go through the list of tables and create csv's
    for each.
    """
    d = db('dima')
    tablelist = None
    while tablelist is None:
        print('gathering tables within dimas..')
        tablelist = table_collector(dimacontainer)
    print('creating csvs for each table..')
    for table in tablelist:
        if not pg:
            looper(dimacontainer, table, csv=True)
        else:
            df = looper(dimacontainer, table, csv=False)
            if 'ItemType' in df.columns:
                # one of the non-vegetation bsne tables: use 'new_tablename'
                # to produce 'tblHorizontalFlux' or 'tblDustDeposition'
                newtablename = new_tablename(df)
                if tablecheck(newtablename):
                    print('MWACK')
                    ingesterv2.main_ingest(df, newtablename, d.str, 10000)
                else:
                    table_create(df, newtablename, 'dima')
                    print('llegue a 2')
                    ingesterv2.main_ingest(df, newtablename, d.str, 10000)
            else:
                print("NOT A HORFLUX TABLE")
                newtablename = table
                if tablecheck(table):
                    print("FOUND THE TABLE IN PG")
                    ingesterv2.main_ingest(df, newtablename, d.str, 10000)
                else:
                    print("DID NOT FIND TABLE IN PG, CREATING...")
                    table_create(df, table, 'dima')
                    ingesterv2.main_ingest(df, newtablename, d.str, 10000)
def modelrun_key_check(modelrunkey):
    d = db("aero")
    if tablecheck("ModelRuns", "aero"):
        try:
            con = d.str
            cur = con.cursor()
            exists_query = '''
            select exists (
                select 1
                from "ModelRuns"
                where "ModelRunKey" = %s
            )'''
            cur.execute(exists_query, (modelrunkey,))
            return cur.fetchone()[0]
        except Exception as e:
            print(e, "error selecting modelruns table.")
    else:
        print("ModelRun table does not exist.")
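# Usage sketch: skip a batch whose ModelRunKey is already recorded
# (hypothetical key and path):
#
#   if not modelrun_key_check("MRK_2021_04"):
#       model_run_updater(batchpath, "MRK_2021_04")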
def pg_check(timestamp, projectkey):
    qry = """
    SELECT EXISTS(
        SELECT * FROM public.raw_met_data
        WHERE "TIMESTAMP" = %s and "ProjectKey" = %s);
    """
    try:
        d = db('met')
        con = d.str
        cur = con.cursor()
        cur.execute(qry, (timestamp, projectkey))
        return cur.fetchone()[0]
    except Exception as e:
        print(e)
        return False
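# Usage sketch (hypothetical values): row_check above calls this per row to
# avoid ingesting duplicate met readings:
#
#   pg_check("2020-01-01 00:15:00", "JORN")  # True if the reading exists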
def model_run_updater(batchpath, modelrunkey, source=None):
    """
    1. creates a table in postgres with the supplied dataframe
    2. appends data to the postgres table
    """
    d = db("aero")
    df = txt_read(batchpath)
    if source is not None:
        df['Source'] = source
    df['ModelRunKey'] = modelrunkey
    # check the aero database explicitly (tablecheck defaults to 'dima')
    if tablecheck('aero_runs', 'aero'):
        print('aero_runs exists, skipping table creation')
        update_model(batchpath, modelrunkey)
        ingesterv2.main_ingest(df, "aero_runs", d.str, 100000)
    else:
        print('creating aero_runs table..')
        table_create(df, "aero_runs", "aero")
        update_model(batchpath, modelrunkey)
        ingesterv2.main_ingest(df, "aero_runs", d.str, 100000)
def colcheck(tablename, conn):
    """
    incomplete implementation: check ingested fields vs. the schema on pg
    """
    qry = f"""
    SELECT column_name
    FROM information_schema.columns
    WHERE table_schema = 'public'
    AND table_name = '{tablename}';
    """
    d = db(conn)
    if tablecheck(tablename, conn):
        con = d.str
        cur = con.cursor()
        try:
            cur.execute(qry)
            return [i[0] for i in cur.fetchall()]
        except Exception as e:
            print(e)
    else:
        print("table does not exist")
def batch_looper(dimacontainer, projkey=None, dev=False, pg=False):
    """
    creates an exhaustive list of tables across all dimas in a folder and
    then uses looper to go through the list of tables and create csv's
    for each. dev-aware version; as the later definition, it supersedes
    the batch_looper above.
    """
    if not dev:
        d = db('dima')
        keyword = "dima"
    else:
        d = db("dimadev")
        keyword = "dimadev"
    tablelist = None
    while tablelist is None:
        print('gathering tables within dimas..')
        tablelist = table_collector(dimacontainer)
        print(tablelist, "tablelist check # 1")
    print('creating csvs for each table..')
    for table in tablelist:
        if not pg:
            looper(dimacontainer, table, csv=True)
        else:
            df = looper(dimacontainer, table, csv=False) if 'tblPlots' not in table \
                else looper(dimacontainer, table, projkey, csv=False)
            print(df.shape, "looper dataframe check # 2")
            if 'ItemType' in df.columns:
                # one of the non-vegetation bsne tables: use 'new_tablename'
                # to produce 'tblHorizontalFlux' or 'tblDustDeposition'
                newtablename = new_tablename(df)
                if tablecheck(newtablename, keyword):
                    print('MWACK')
                    df = dateloadedcheck(df)
                    df = ovenTemp_INT(df)
                    ingesterv2.main_ingest(df, newtablename, d.str, 10000)
                else:
                    table_create(df, newtablename, keyword)
                    print('DDT ')
                    df = dateloadedcheck(df)
                    df = ovenTemp_INT(df)
                    ingesterv2.main_ingest(df, newtablename, d.str, 10000)
            else:
                print("NOT A HORFLUX TABLE")
                newtablename = table
                if tablecheck(table, keyword):
                    print(f"FOUND THE {newtablename} IN PG")
                    df = dateloadedcheck(df)
                    ingesterv2.main_ingest(df, newtablename, d.str, 10000)
                else:
                    print(f"DID NOT FIND {newtablename} IN PG, CREATING...")
                    table_create(df, table, keyword)
                    df = dateloadedcheck(df)
                    ingesterv2.main_ingest(df, newtablename, d.str, 10000)
def met_batcher(path, whichdata=None):
    # note: renamed from `type`, which shadowed the builtin
    ext_by_set = {"historic": ".csv", "current": ".dat"}
    if whichdata not in ext_by_set:
        print('whichdata must be "historic" or "current"')
        return
    df_dict = {}
    folderlist = os.listdir(path)
    count = 1
    start = time.time()
    for i in folderlist:
        for j in os.listdir(os.path.join(path, i)):
            if "historic" in whichdata:
                if os.path.splitext(os.path.join(path, i, j))[1] == ext_by_set[whichdata]:
                    print(j)
                    startlocal = time.time()
                    local_path = os.path.join(path, i, j)
                    proj_key = i
                    ins = datReader(local_path)
                    tempdf = ins.getdf()
                    tempdf = second_round(tempdf)
                    tempdf = col_name_fix(tempdf)
                    tempdf = new_instrumentation(tempdf)
                    tempdf = remove_emptytimestamps(tempdf)
                    tempdf = type_fix(tempdf)
                    tempdf['ProjectKey'] = proj_key
                    df_dict.update({f'df{count}': tempdf})
                    elapsed = round(time.time() - startlocal, 2)
                    print(f'time elapsed for {j} dataset: {elapsed}s')
                    count += 1
            if "current" in whichdata:
                if os.path.splitext(os.path.join(path, i, j))[1] == ext_by_set[whichdata] \
                        and ('Bellevue' not in path):
                    print(j)
                    startlocal = time.time()
                    local_path = os.path.join(path, i, j)
                    proj_key = i
                    ins = datReader(local_path)
                    tempdf = ins.getdf()
                    tempdf = second_round(tempdf)
                    tempdf = col_name_fix(tempdf)
                    tempdf = new_instrumentation(tempdf)
                    tempdf = remove_emptytimestamps(tempdf)
                    tempdf['ProjectKey'] = proj_key
                    if tempdf.TIMESTAMP.isnull().any():
                        tempdf = tempdf.loc[tempdf.TIMESTAMP.notnull()]
                    df_dict.update({f'df{count}': tempdf})
                    elapsed = round(time.time() - startlocal, 2)
                    print(f'time elapsed for {j} dataset: {elapsed}s')
                    count += 1
    finaldf = pd.concat(df_dict.values())
    finaldf = type_fix(finaldf)
    finaldf.TIMESTAMP = pd.to_datetime(finaldf.TIMESTAMP)
    print(f'final elapsed time: {time.time() - start}')
    return finaldf
def engine_conn_string(string):
    d = db(string)
    return (f'postgresql://{d.params["user"]}:{d.params["password"]}'
            f'@{d.params["host"]}:{d.params["port"]}/{d.params["dbname"]}')
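# Usage sketch: the returned URI feeds SQLAlchemy/pandas, as pg2csvExport
# does above (hypothetical query):
#
#   engine = engine_conn_string("dima")
#   df = pd.read_sql('SELECT * from postgres.public."tblPlots"', con=engine)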
def pg_access(tablename=None, method=None, output=None):
    """
    sends a dataframe to an access database or pg. using `task_parser`,
    assembles a specific table and either sends it to postgres or exports it
    to an access database. if the access db does not exist, creates it in
    the `output` path specified.

    todo:
    - implement using only method to send all tables, not just one.
    - implement faster access mdb write
    -------------------------
    Args:
        tablename (string): table to pull using `task_parser`
        method (string): `pg` or `mdb`; chooses between sending to the
            postgres database or exporting to a local access .mdb
        output (string): path of the directory where the access database
            will be created.
    """
    df = task_parser(tablename) if method != 'mdb' else None
    if method is None:
        print("please choose 'pg' or 'mdb' output")
    elif method == 'pg':
        # sending to postgres
        d = db('nri')
        if tablecheck(tablename):
            ingesterv2.main_ingest(df, tablename, d.str)
        else:
            table_create(df, tablename, 'nri')
            ingesterv2.main_ingest(df, tablename, d.str)
    elif method == 'mdb':
        # sending to an access db
        def chunker(seq, size):
            # used by tqdm
            return (seq[pos:pos + size] for pos in range(0, len(seq), size))

        if tablename is None:
            # TODO implement reading all the tables when no tablename is specified
            for table in table_map.keys():
                print(table)
        else:
            # build the dataframe and field dictionary; create the mdb if it
            # does not exist yet, then chunk the dataframe into it
            df = task_parser(tablename)
            onthefly = access_dictionary(df, tablename)
            mdb_name = f'NRI_EXPORT_{date.today().month}_{date.today().day}_{date.today().year}.mdb'
            mdb_path = os.path.join(output, mdb_name)
            if mdb_name not in os.listdir(output):
                mdb_create(output)
            chunksize = max(1, len(df) // 10)
            with tqdm(total=len(df)) as pbar:
                for i, cdf in enumerate(chunker(df, chunksize)):
                    replace = "replace" if i == 0 else "append"
                    cdf.to_sql(name=f'{tablename}',
                               con=ret_access(mdb_path),
                               index=False,
                               if_exists=replace,
                               dtype=onthefly,
                               chunksize=chunksize)
                    pbar.update(chunksize)
                tqdm._instances.clear()
def pg_send(table: str, path: str, csv=None, debug=None):
    """
    Sends dataframe to postgres or prints out CSV.

    Given a DIMA path and tablename, uses the function main_translate to
    create a dataframe and either send it to a postgres database or print it
    out to a CSV in the same directory the DIMA file is in.

    Parameters
    ----------
    table : str
        Name of the table in DIMA. example: 'tblLines'
    path : str
        Physical path to DIMA Access file. example: 'c://Folder/dima_file.mdb'
    csv : None or any character. (default value: None)
        If not None, it will print out the table dataframe to CSV.
    debug : None or any character. (default value: None)
        If not None, it will print out each of the steps the function's
        processes take.
    """
    d = db('dima')
    df = main_translate(table, path)
    print("STARTING INGEST")
    df['DateLoadedInDB'] = datetime.now().strftime("%d-%m-%Y %H:%M:%S")
    # DBKey: calibration DIMAs get a 'calibration' suffix
    if ('calibration' in path) or ('Calibration' in path):
        df['DBKey'] = os.path.join(
            split(splitext(path)[0])[1].replace(" ", ""), 'calibration')
    else:
        # creates non-calibration DBKey here
        df['DBKey'] = split(splitext(path)[0])[1].replace(" ", "")
    # per the docstring: csv=None (the default) ingests to pg, while any
    # non-None csv writes the table out via csv_fieldcheck instead
    if 'ItemType' in df.columns:
        # one of the non-vegetation bsne tables: use 'new_tablename' to
        # produce 'tblHorizontalFlux' or 'tblDustDeposition'
        newtablename = new_tablename(df)
        if tablecheck(newtablename):
            print('MWACK')
            if csv:
                csv_fieldcheck(df, path, table)
            else:
                ingesterv2.main_ingest(df, newtablename, d.str, 10000)
        else:
            table_create(df, newtablename, 'dima')
            print('llegue a 2')
            if csv:
                csv_fieldcheck(df, path, table)
            else:
                ingesterv2.main_ingest(df, newtablename, d.str, 10000)
    else:
        print("NOT A HORFLUX TABLE")
        newtablename = table
        if tablecheck(table):
            print("FOUND THE TABLE IN PG")
            if csv:
                csv_fieldcheck(df, path, table)
            else:
                ingesterv2.main_ingest(df, newtablename, d.str, 10000)
        else:
            print("DID NOT FIND TABLE IN PG, CREATING...")
            table_create(df, table, 'dima')
            if csv:
                csv_fieldcheck(df, path, table)
            else:
                ingesterv2.main_ingest(df, newtablename, d.str, 10000)
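# Usage sketch (hypothetical paths): send one DIMA table to postgres, or dump
# it next to the source file instead, per the docstring's csv flag:
#
#   pg_send('tblLines', 'c://Folder/dima_file.mdb')          # ingest to pg
#   pg_send('tblLines', 'c://Folder/dima_file.mdb', csv=1)   # write CSV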