def get_sidewalk_data(**kwargs):
    """Get sidewalk condition data from DB."""
    sw_query = general.file_to_string('./sql/sidewalk_insp.sql', __file__)
    sw_conn = MsSqlHook(mssql_conn_id='streets_cg_sql')
    df = sw_conn.get_pandas_df(sw_query)
    # Rename columns we're keeping
    df = df.rename(columns={
        'sap_id': 'seg_id',
        'legacy_id': 'geojoin_id',
        'inspectiondate': 'oci_date',
        'rating': 'oci_desc',
        'condition': 'oci'
    })
    df = df.drop(['cgLastModified', 'MaxInspect', 'MaxMod'], axis=1)
    # Write csv
    logging.info('Writing ' + str(df.shape[0]) + ' rows')
    general.pos_write_csv(df, cond_file, date_format=conf['date_format_ymd'])
    return "Successfully wrote prod file"
def get_streets_paving_data():
    """Get streets paving data from DB."""
    pv_query = general.file_to_string('./sql/pavement_ex.sql', __file__)
    pv_conn = MsSqlHook(mssql_conn_id='streets_cg_sql')
    df = pv_conn.get_pandas_df(pv_query)
    results = df.shape[0]
    general.pos_write_csv(df, temp_file)
    return f"Successfully wrote temp file with {results} records"
def get_special_events():
    """Get special events from DB."""
    se_query = general.file_to_string('./sql/special_events.sql', __file__)
    se_conn = MsSqlHook(mssql_conn_id='special_events_sql')
    df = se_conn.get_pandas_df(se_query)
    df['event_id'] = pd.to_numeric(df['event_id'],
                                   errors='coerce',
                                   downcast='integer')
    general.pos_write_csv(df, temp_file, date_format=conf['date_format_ymd_hms'])
    return "Retrieved special events to temp file."
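# A minimal standalone sketch (values are illustrative) of the
# pd.to_numeric call above: errors='coerce' turns unparseable event ids
# into NaN, and downcast='integer' only takes effect when the result
# can actually hold an integer dtype (i.e. no NaN remains).
import pandas as pd

print(pd.to_numeric(pd.Series(['12', '7', 'n/a']), errors='coerce'))
# -> 12.0, 7.0, NaN (stays float64 because of the NaN)
print(pd.to_numeric(pd.Series(['12', '7']), errors='coerce', downcast='integer'))
# -> 12, 7 (downcast to int8)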
def execute(self, context):
    mssql_hook = MsSqlHook(mssql_conn_id=self.mssql_conn_id)
    df = mssql_hook.get_pandas_df(self.select_sql)
    # Convert column data types to nullable integer
    if self.int_columns:
        for col in self.int_columns:
            df[col] = df[col].astype("Int64")
    # Create the target folder if it does not exist
    if not os.path.exists(self.target_file_dir):
        os.mkdir(self.target_file_dir)
    file_path = os.path.join(self.target_file_dir, self.file_name)
    df.to_csv(file_path, index=False)
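# A minimal standalone sketch (series values are illustrative) of why
# astype("Int64") -- pandas' nullable integer, capital I -- is used
# above: SQL NULLs arrive as NaN, which a plain int64 column cannot
# hold, while Int64 stores them as <NA>.
import pandas as pd

s = pd.Series([1.0, None, 3.0])
print(s.astype("Int64"))  # 1, <NA>, 3 with dtype Int64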
def execute(self, context):
    self.log.info("Querying data from source: {0}".format(
        self.mssql_source_conn_id))
    src_mssql_hook = MsSqlHook(mssql_conn_id=self.mssql_source_conn_id)
    df = src_mssql_hook.get_pandas_df(sql=self.source_sql,
                                      parameters=self.source_sql_params)
    rows_total = df.shape[0]
    self.log.info("Writing data to {0}.".format(self.destination_filepath))
    df.to_csv(self.destination_filepath, sep=self.sep, index=False)
    self.log.info("Total written to file: {0} rows".format(rows_total))
    return rows_total
def get_fd_data(**kwargs):
    """Get fire department data from database."""
    logging.info("Get fire department data from CAD archive")
    fd_query = general.file_to_string('./sql/fd.sql', __file__)
    fd_conn = MsSqlHook(mssql_conn_id='fire_department')
    logging.info("Reading data into pandas DataFrame")
    df = fd_conn.get_pandas_df(fd_query)
    # Write csv
    logging.info('Writing ' + prod_file)
    general.pos_write_csv(df, prod_file, date_format=conf['date_format_ymd_hms'])
    return "Successfully wrote prod file at " + prod_file
def get_sidewalk_gis(**kwargs):
    """Get sidewalk geodatabase from shared drive."""
    sde_query = general.file_to_string('./sql/sidewalk_sde.sql', __file__)
    sde_conn = MsSqlHook(mssql_conn_id='atlas_sde')
    #sde_server = conf['sde_server']
    #sde_user = conf['sde_user']
    #sde_pw = conf['sde_pw']
    #sde_conn = pymssql.connect(sde_server, sde_user, sde_pw, 'sdw')
    #query = "SELECT *, [Shape].STAsText() as geom FROM SDW.IAMSD.SIDEWALK"
    df = sde_conn.get_pandas_df(sde_query)
    df.columns = [x.lower() for x in df.columns]
    df = df.drop('shape', axis=1)
    df = df.rename(columns={
        'sapid': 'seg_id',
        'cdcode': 'council',
        'cpcode': 'comm_plan',
        'legacy_id': 'geojoin_id',
        'iamfloc': 'fun_loc_id',
        'loc_descr': 'loc_desc'
    })
    df = df.fillna('')
    logging.info('processed {} rows'.format(df.shape[0]))
    geospatial.df2shp(df=df,
                      folder=prod_dir,
                      layername=layername,
                      dtypes=dtypes,
                      gtype=gtype,
                      epsg=2230)
    return "Converted table to shapefile"
def get_documentum(mode, **kwargs):
    """Get tables from Documentum."""
    logging.info('Getting files from documentum')
    table_name = dn.table_name(mode)
    for name in table_name:
        logging.info('Querying for {0} table'.format(name))
        query_string = 'SELECT * FROM SCSLEGIS.dbo.{0};'.format(name)
        logging.info('Connecting to MS Database')
        documentum_conn = MsSqlHook(mssql_conn_id='docm_sql')
        logging.info('Reading data to Pandas DataFrame')
        df = documentum_conn.get_pandas_df(query_string)
        logging.info('Correcting title column')
        df['TITLE'] = fix_title(df[['TITLE', 'OBJECT_NAME']])
        save_path = conf['prod_data_dir'] + '/documentum_{0}.csv'.format(name.lower())
        logging.info('Writing Production file')
        general.pos_write_csv(df, save_path)
    return "Successfully retrieved Documentum tables"
def get_fd_data(**kwargs):
    """Get fire department data from database."""
    logging.info("Get fire department data from database")
    fd_query = general.file_to_string('./sql/fd_pn.sql', __file__)
    fd_conn = MsSqlHook(mssql_conn_id='fire_department')
    logging.info("Reading data into pandas DataFrame")
    df = fd_conn.get_pandas_df(fd_query)
    df = df.rename(columns={
        'city': 'address_city',
        'response_month': 'month_response',
        'response_year': 'year_response'
    })
    logging.info("Writing {} rows to prod".format(df.shape[0]))
    # Write csv
    general.pos_write_csv(df, prod_file)
    return "Successfully wrote prod file"
def get_onbase():
    """Get tables from OnBase."""
    logging.info('Getting files from onbase')
    for root, dirs, files in os.walk('./poseidon/dags/city_docs/sql/onbase'):
        for name in files:
            logging.info('Querying for ' + name)
            path = './sql/onbase/{}'.format(name)
            query_string = general.file_to_string(path, __file__)
            logging.info('Connecting to MS Database')
            onbase_conn = MsSqlHook(mssql_conn_id='onbase_sql')
            logging.info('Reading data to Pandas DataFrame')
            df = onbase_conn.get_pandas_df(query_string)
            # Strip the .sql extension to get the table type
            table_type = name[0:-4]
            logging.info('Correcting title column')
            df['TITLE'] = fix_title(df[['TITLE', 'OBJECT_NAME']])
            save_path = '{0}/onbase_{1}.csv'.format(conf['prod_data_dir'], table_type)
            logging.info('Writing Production file')
            general.pos_write_csv(df, save_path)
    return "Successfully retrieved OnBase tables"
def get_onbase_test():
    """Get tables from OnBase test database."""
    logging.info('Getting files from onbase')
    for root, dirs, files in os.walk('./poseidon/dags/city_docs/sql/onbase'):
        for name in files:
            logging.info('Querying for ' + name)
            path = './sql/onbase/{}'.format(name)
            query_string = general.file_to_string(path, __file__)
            logging.info('Connecting to MS Database')
            onbase_conn = MsSqlHook(mssql_conn_id='onbase_test_sql')
            logging.info('Reading data to Pandas DataFrame')
            df = onbase_conn.get_pandas_df(query_string)
            # Strip the .sql extension to get the table type
            table_type = name[0:-4]
            logging.info('Correcting title column')
            df['TITLE'] = fix_title(df[['TITLE', 'OBJECT_NAME']])
            save_path = '{0}/onbase_test_{1}.csv'.format(conf['prod_data_dir'], table_type)
            logging.info('Writing Production file')
            general.pos_write_csv(df, save_path)
    return "Successfully retrieved OnBase tables"
def execute(self, context):
    self.log.info("Querying data from source: {0}".format(
        self.mssql_source_conn_id))
    src_mssql_hook = MsSqlHook(mssql_conn_id=self.mssql_source_conn_id)
    df = src_mssql_hook.get_pandas_df(sql=self.source_sql,
                                      parameters=self.source_sql_params)
    rows_total = df.shape[0]
    self.log.info("Writing data to {0}.".format(self.destination_filepath))
    writer = pd.ExcelWriter(self.destination_filepath, engine=self.excel_engine)
    if self.write_mode == 'append_tab' and path.isfile(self.destination_filepath):
        # Load the existing workbook so the new sheet is appended to it
        writer.book = load_workbook(self.destination_filepath)
    # Disable the default bold header style pandas applies to Excel output
    pd.io.formats.excel.header_style = None
    if self.sheet_name is not None:
        df.to_excel(writer, sheet_name=self.sheet_name, index=False)
    else:
        df.to_excel(writer, index=False)
    writer.save()
    self.log.info("Total written to file: {0} rows".format(rows_total))
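# A minimal alternative sketch for the append case above, assuming
# pandas >= 1.3 with openpyxl and an existing report.xlsx (the file
# name and sheet name are illustrative): newer pandas removed the
# writer.book assignment and writer.save(), and supports append mode
# directly via ExcelWriter.
import pandas as pd

df = pd.DataFrame({'a': [1, 2]})
with pd.ExcelWriter('report.xlsx', engine='openpyxl',
                    mode='a', if_sheet_exists='replace') as writer:
    df.to_excel(writer, sheet_name='new_tab', index=False)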
def get_documentum(mode, **kwargs):
    """Get tables from Documentum."""
    logging.info('Getting files from documentum')
    table_name = dn.table_name(mode)
    for name in table_name:
        logging.info('Querying for {0} table'.format(name))
        query_string = 'SELECT * FROM SCSLEGIS.dbo.{0};'.format(name)
        logging.info('Connecting to MS Database')
        documentum_conn = MsSqlHook(mssql_conn_id='docm_sql')
        logging.info('Reading data to Pandas DataFrame')
        df = documentum_conn.get_pandas_df(query_string)
        logging.info('Correcting title column')
        df['TITLE'] = fix_title(df[['TITLE', 'OBJECT_NAME']])
        # NOTE: both branches currently build the same path
        if mode == 'schedule_24':
            save_path = conf['prod_data_dir'] + '/documentum_{0}.csv'.format(name.lower())
        else:
            save_path = conf['prod_data_dir'] + '/documentum_{0}.csv'.format(name.lower())
        logging.info('Writing Production file')
        general.pos_write_csv(df, save_path)
    return "Successfully retrieved Documentum tables"
def get_documentum_test():
    """Get tables from Documentum test database."""
    logging.info('Getting files for documentum test')
    table_name = dn.table_name('schedule_daily') \
        + dn.table_name('schedule_hourly_15') \
        + dn.table_name('schedule_hourly_30')
    logging.info(table_name)
    for name in table_name:
        logging.info('Querying for {0} table'.format(name))
        query_string = 'SELECT * FROM SCSLEGIS.dbo.{0};'.format(name)
        logging.info('Connecting to MS Database')
        documentum_conn = MsSqlHook(mssql_conn_id='docm_test_sql')
        logging.info('Reading data to Pandas DataFrame')
        try:
            df = documentum_conn.get_pandas_df(query_string)
            logging.info('Correcting title column')
            df['TITLE'] = fix_title(df[['TITLE', 'OBJECT_NAME']])
            save_path = conf['prod_data_dir'] + '/documentum_{0}_test.csv'.format(name.lower())
            general.pos_write_csv(df, save_path)
        except Exception as e:
            logging.info(f'Could not read {name} because {e}')
    return "Successfully retrieved Documentum tables"
def get_streets_paving_data(mode='sdif', **kwargs):
    """Get streets paving data from DB."""
    pv_query = general.file_to_string('./sql/pavement_ex.sql', __file__)
    pv_conn = MsSqlHook(mssql_conn_id='streets_cg_sql')

    moratorium_string = "Post Construction"
    phone_UTLY = "858-627-3200"
    phone_OTHER = "619-527-7500"
    TSW_PM = "*****@*****.**"
    UTLY_PM = "*****@*****.**"
    ACT_OVERLAY_CONCRETE_PM = "*****@*****.**"
    ACT_SLURRY_SERIES_PM = "*****@*****.**"

    # Different string for imcat mode.
    if mode == 'imcat':
        moratorium_string = "Post-Construction"

    df = pv_conn.get_pandas_df(pv_query)

    for i in ['seg_id', 'rd_seg_id', 'wo_id', 'wo_name', 'wo_status',
              'wo_proj_type', 'job_activity', 'seg_func_class']:
        df[i] = df[i].astype(str)

    df['job_completed_cbox'] = df['job_completed_cbox'].astype(bool)

    # Backfill - set all post-construction statuses to the moratorium string
    df.loc[df.wo_status.str.contains(
        'post construction|moratorium|post-construction',
        regex=True,
        case=False), "wo_status"] = moratorium_string

    if mode == 'imcat':
        # Remove records without a completed date in the UTLY and TSW
        # work orders (IMCAT ONLY)
        df = df[~(((df.wo_id == "UTLY") & df.job_end_dt.isnull()) |
                  ((df.wo_id == "TSW") & df.job_end_dt.isnull()))]

        # Remove empty activities (IMCAT ONLY)
        df = df[~(df.job_activity.isnull() |
                  (df.job_activity == "") |
                  (df.job_activity == "None"))]

    # Remove data entry, mill/pave, structure widening and patching activities
    if mode == 'imcat':
        remove_search = 'data entry|mill|structure wid|patching'
    else:
        remove_search = 'data entry|structure wid|patching'

    df = df[~(df.job_activity.str.contains(remove_search, regex=True, case=False))]

    # Search strings for project types
    concrete_search = "panel rep|pcc - reconstruc"
    slurry_search = "surface treatment|scrub seal|cape seal|central mix"
    overlay_search = "resurfacing|overlay|ac overlay|mill|ac - overlay|ac - ac overlay|ac - reconstruct|ac inlay"

    # Reset project type, then classify from the job activity
    df['wo_proj_type'] = None

    # Concrete
    df.loc[df.job_activity.str.contains(
        concrete_search, regex=True, case=False), 'wo_proj_type'] = 'Concrete'
    # Slurry
    df.loc[df.job_activity.str.contains(
        slurry_search, regex=True, case=False), 'wo_proj_type'] = 'Slurry'
    # Overlay
    df.loc[df.job_activity.str.contains(
        overlay_search, regex=True, case=False), 'wo_proj_type'] = 'Overlay'

    # Remove all records more than five years old
    #pv <- pv[(as.Date(pv$job_end_dt) > (today() - years(5))) | is.na(pv$job_end_dt),]

    # Create ref dates
    #today = kwargs['execution_date']
    today = general.today()
    five_yrs_ago = today.replace(year=(today.year - 5))
    three_yrs_ago = today.replace(year=(today.year - 3))

    # Remove records older than five years
    df = df[(df.job_end_dt > five_yrs_ago) | (df.job_end_dt.isnull())]

    # Remove slurry records more than three years old (IMCAT ONLY)
    if mode == 'imcat':
        mask = ~((df.wo_proj_type == 'Slurry') & (df.job_end_dt < three_yrs_ago))
        df = df[mask]

    # Create a feature for completed jobs
    df['final_job_completion_state'] = False
    #pv[(!is.na(pv$job_end_dt) & pv$job_completed_cbox == 1), "final_job_completion_state"] <- 1
    df.loc[df.job_end_dt.notnull(), "final_job_completion_state"] = True

    # Set all completed jobs to moratorium status
    df.loc[df.final_job_completion_state == True, "wo_status"] = moratorium_string

    # Set dates in the future for TSW work orders as Construction
    mask = (df.wo_id == 'TSW') & \
           (df.job_end_dt.notnull()) & \
           (df.job_end_dt > today)
    df.loc[mask, "wo_status"] = "Construction"

    # Set phone number for UTLY
    df.loc[df.wo_id == 'UTLY', 'wo_pm_phone'] = phone_UTLY
    # Set phone number for everything else
    df.loc[df.wo_id != 'UTLY', 'wo_pm_phone'] = phone_OTHER
    # Set PM for UTLY
    df.loc[df.wo_id == 'UTLY', 'wo_pm'] = UTLY_PM
    # Set PM for TSW
    df.loc[df.wo_id == 'TSW', 'wo_pm'] = TSW_PM

    # Set PM for Overlay / Concrete
    #mask = (df.wo_proj_type == 'Overlay') | (df.wo_proj_type == 'Concrete') & (df.wo_pm.isnull())
    mask = (df.wo_pm.isnull()) & \
           ((df.wo_proj_type == 'Overlay') | (df.wo_proj_type == 'Concrete'))
    df.loc[mask, 'wo_pm'] = ACT_OVERLAY_CONCRETE_PM

    # Set PM for Slurry / Series
    mask = (df.wo_pm.isnull()) & \
           ((df.wo_proj_type == 'Slurry') | (df.wo_proj_type == 'Series Circuit'))
    df.loc[mask, 'wo_pm'] = ACT_SLURRY_SERIES_PM

    # Spot records with unknown activity, type or status
    mask = df.job_activity.isnull() | (df.job_activity == 'None') | (df.job_activity == '') \
        | df.wo_proj_type.isnull() | (df.wo_proj_type == 'None') | (df.wo_proj_type == '') \
        | df.wo_status.isnull() | (df.wo_status == 'None') | (df.wo_status == '')
    spot_unknown = df[mask]
    logging.info('Found {} records with no activity, type or status'.format(
        spot_unknown.shape[0]))

    # Remove unknown records
    df = df[~mask]

    # Sort by job end date, most recent first
    df = df.sort_values(by='job_end_dt', na_position='last', ascending=False)

    # Keep only the most recent record per segment
    # NOTE: collapsing to one row per seg_id may hide older work orders
    df = df.drop_duplicates('seg_id', keep='first')

    # Rename columns we're keeping
    df = df.rename(columns={
        'pve_id': 'PVE_ID',
        'seg_id': 'SEG_ID',
        'wo_id': 'PROJECTID',
        'wo_name': 'TITLE',
        'wo_pm': 'PM',
        'wo_pm_phone': 'PM_PHONE',
        'wo_design_start_dt': 'START',
        'wo_design_end_dt': 'END',
        'wo_resident_engineer': 'RESIDENT_ENGINEER',
        'job_end_dt': 'MORATORIUM',
        'wo_status': 'STATUS',
        'wo_proj_type': 'TYPE',
        'seg_length_ft': 'LENGTH',
        'seg_width_ft': 'WIDTH',
    })

    # Keep only the uppercase (renamed) columns
    df = df.filter(regex="^[A-Z0-9]")

    # Remove START and END for projects in moratorium
    df.loc[df.STATUS == moratorium_string, ['START', 'END']] = None

    # For IMCAT, uppercase the status
    if mode == 'imcat':
        df['STATUS'] = df['STATUS'].str.upper()

    # Write csv
    logging.info('Writing ' + str(df.shape[0]) + ' rows in mode ' + mode)
    general.pos_write_csv(df, prod_file[mode],
                          date_format=conf['date_format_ymd_hms'])
    return "Successfully wrote prod file at " + prod_file[mode]
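# A minimal, hypothetical sketch of wiring the task above into a DAG,
# assuming Airflow 1.x (matching the MsSqlHook usage in this file);
# the task id, kwargs, and the pre-existing `dag` object are
# illustrative, not taken from this repo.
from airflow.operators.python_operator import PythonOperator

get_paving_task = PythonOperator(
    task_id='get_streets_paving_data',
    python_callable=get_streets_paving_data,
    op_kwargs={'mode': 'imcat'},
    provide_context=True,
    dag=dag)  # `dag` is an assumed, already-defined airflow.DAG object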