def clean_traffic_counts(src_fname='traffic_counts_file',
                         out_fname='traffic_counts_raw_clean'):
    """Clean traffic counts data."""
    xlsx_file = "{0}/{1}.xlsx".format(conf['temp_data_dir'], src_fname)
    out_csv_file = "{0}/{1}.csv".format(conf['temp_data_dir'], out_fname)

    names = [
        'street_name', 'limits', 'northbound_count', 'southbound_count',
        'eastbound_count', 'westbound_count', 'total_count', 'file_no',
        'date_count'
    ]

    worksheet = pd.read_excel(xlsx_file,
                              sheet_name='TRAFFIC',
                              header=None,
                              skiprows=[0, 1, 2, 3],
                              usecols=[8, 9, 10, 11, 12, 13, 14, 15, 16],
                              names=names)

    # Write temp csv
    general.pos_write_csv(worksheet,
                          out_csv_file,
                          date_format=conf['date_format_ymd_hms'])

    return "Successfully cleaned traffic counts data."

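# These jobs rely on a shared `general` utility module and a `conf` dict that
# are defined elsewhere in the project. A minimal sketch of the assumed
# pieces, for context only; the names and defaults here are assumptions,
# not the project's actual code:
import pandas as pd

conf = {
    'temp_data_dir': '/data/temp',               # assumed layout
    'prod_data_dir': '/data/prod',
    'date_format_ymd': '%Y-%m-%d',
    'date_format_ymd_hms': '%Y-%m-%d %H:%M:%S',
}

def pos_write_csv(df, fname, **kwargs):
    """Hypothetical stand-in for general.pos_write_csv: write a csv with
    consistent defaults (no index, utf-8)."""
    df.to_csv(fname, index=False, encoding='utf-8', **kwargs)
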
def join_community_plan():
    """Spatially joins community plan districts data to GID data."""
    cp_geojson = conf['prod_data_dir'] + '/cmty_plan_datasd.geojson'
    gid_cp = spatial_join_pt(cd_file_gid, cp_geojson, lat='lat', lon='long')
    cols = gid_cp.columns.values.tolist()
    drop_cols = ['objectid', 'acreage']
    if "level_0" in cols:
        drop_cols.append('level_0')
    gid_cp = gid_cp.drop(drop_cols, axis=1)
    general.pos_write_csv(gid_cp,
                          cp_file_gid,
                          date_format='%Y-%m-%dT%H:%M:%S%z')
    return "Successfully joined community plan districts to GID data"

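# spatial_join_pt is assumed to be a project geospatial helper. A minimal
# sketch of what it likely does, using geopandas; the signature and return
# type are inferred from the call sites above, not confirmed:
import geopandas as gpd
import pandas as pd

def spatial_join_pt(pt_file, poly_file, lat='lat', lon='lon'):
    """Hypothetical sketch: attach polygon attributes to points by location."""
    df = pd.read_csv(pt_file, low_memory=False)
    polys = gpd.read_file(poly_file)
    pts = gpd.GeoDataFrame(df,
                           geometry=gpd.points_from_xy(df[lon], df[lat]),
                           crs=polys.crs)
    joined = gpd.sjoin(pts, polys, how='left', predicate='intersects')
    return pd.DataFrame(joined.drop(columns=['geometry', 'index_right']))
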
def process_billing():
    """Process billing data."""
    df = pd.read_csv(conf['temp_data_dir'] + '/' + datasd[0],
                     low_memory=False,
                     error_bad_lines=False,
                     encoding='cp1252')
    df = df.drop('RentCode', axis=1)
    df = df.rename(columns={
        'LesseeName': 'lessee_name',
        'RecordDate': 'billing_record_date',
        'RecordType': 'line_type_calc',
        'InvoiceNumber': 'invoice_number',
        'PeriodCovered': 'period_covered',
        'Amount': 'AR_line_amt_display',
        'Status': 'line_status_calc',
        'InvoiceDue': 'invoice_due_date'
    })
    general.pos_write_csv(df,
                          conf['prod_data_dir'] + '/' + datasd[0],
                          date_format=conf['date_format_ymd_hms'])
    return 'Successfully processed billing data.'

def process_leases():
    """Process leases data."""
    df = pd.read_csv(conf['temp_data_dir'] + '/' + datasd[1],
                     low_memory=False,
                     error_bad_lines=False,
                     encoding='cp1252')
    df = df.rename(columns={
        'SiteCode': 'site_code',
        'LesseeName': 'lessee_name',
        'LesseeCompany': 'lessee_company',
        'LesseeDBA': 'lessee_DBA',
        'LesseeZip': 'lessee_ZIP',
        'LeaseType': 'lease_record_type',
        'Description': 'lease_description',
        'Status': 'lease_status',
        'Location': 'lease_location_name',
        'Nonprofit': 'nonprofit_lessee',
        'EffectiveDate': 'effective_date',
        'SchedTermination': 'sched_termination_date',
        'BillingRentCode': 'rent_code',
        'RentAmount': 'cost_line_amt_USD'
    })
    general.pos_write_csv(df,
                          conf['prod_data_dir'] + '/' + datasd[1],
                          date_format=conf['date_format_ymd_hms'])
    return 'Successfully processed leases data.'

def join_parks():
    """Spatially joins parks data to GID data."""
    parks_geojson = conf['prod_data_dir'] + '/parks_datasd.geojson'
    gid_parks = spatial_join_pt(cp_file_gid,
                                parks_geojson,
                                lat='lat',
                                lon='long')
    cols = gid_parks.columns.values.tolist()
    drop_cols = ['objectid', 'gis_acres', 'location']
    if "level_0" in cols:
        drop_cols.append('level_0')
    gid_parks = gid_parks.drop(drop_cols, axis=1)
    gid_parks = gid_parks.rename(columns={'name': 'park_name'})
    general.pos_write_csv(gid_parks,
                          parks_file_gid,
                          date_format='%Y-%m-%dT%H:%M:%S%z')
    return "Successfully joined parks to GID data"

def clean_traffic_counts(src_fname='traffic_counts_file',
                         out_fname='traffic_counts_raw_clean'):
    """Clean traffic counts data."""
    xlsx_file = "{0}/{1}.xlsx".format(conf['temp_data_dir'], src_fname)
    out_csv_file = "{0}/{1}.csv".format(conf['temp_data_dir'], out_fname)

    names = [
        'street_name', 'limits', 'all_count', 'northbound_count',
        'southbound_count', 'eastbound_count', 'westbound_count',
        'total_count', 'file_no', 'count_date'
    ]

    worksheet = pd.read_excel(xlsx_file,
                              sheet_name='TRAFFIC',
                              header=None,
                              skiprows=[0, 1, 2, 3],
                              usecols=[8, 9, 10, 11, 12, 13, 14, 15, 16, 17],
                              names=names)

    # Write temp csv
    general.pos_write_csv(worksheet,
                          out_csv_file,
                          date_format=conf['date_format_ymd_hms'])

    return "Successfully cleaned traffic counts data."

def get_sidewalk_data(**kwargs):
    """Get sidewalk condition data from DB."""
    sw_query = general.file_to_string('./sql/sidewalk_insp.sql', __file__)
    sw_conn = MsSqlHook(mssql_conn_id='streets_cg_sql')
    df = sw_conn.get_pandas_df(sw_query)

    # Rename columns we're keeping
    df = df.rename(columns={
        'sap_id': 'seg_id',
        'legacy_id': 'geojoin_id',
        'inspectiondate': 'oci_date',
        'rating': 'oci_desc',
        'condition': 'oci'
    })

    df = df.drop(['cgLastModified', 'MaxInspect', 'MaxMod'], axis=1)

    # Write csv
    logging.info('Writing ' + str(df.shape[0]) + ' rows')
    general.pos_write_csv(df, cond_file, date_format=conf['date_format_ymd'])

    return "Successfully wrote prod file"

def process_billing():
    """Process billing data."""
    df = pd.read_csv(conf['temp_data_dir'] + '/' + datasd[0],
                     low_memory=False,
                     error_bad_lines=False,
                     encoding='cp1252')
    df = df.drop('RentCode', axis=1)
    df = df.rename(columns={
        'LesseeName': 'lessee_name',
        'RecordDate': 'date_billing_record',
        'RecordType': 'line_type_calc',
        'InvoiceNumber': 'invoice_number',
        'PeriodCovered': 'period_covered',
        'Amount': 'AR_line_amt_display',
        'Status': 'line_status_calc',
        'InvoiceDue': 'date_invoice_due'
    })
    general.pos_write_csv(df,
                          conf['prod_data_dir'] + '/' + datasd[0],
                          date_format=conf['date_format_ymd_hms'])
    return 'Successfully processed billing data.'

def process_leases():
    """Process leases data."""
    df = pd.read_csv(conf['temp_data_dir'] + '/' + datasd[1],
                     low_memory=False,
                     error_bad_lines=False,
                     encoding='cp1252')
    df = df.rename(columns={
        'SiteCode': 'site_code',
        'LesseeName': 'lessee_name',
        'LesseeCompany': 'lessee_company',
        'LesseeDBA': 'lessee_DBA',
        'LesseeZip': 'address_zip',
        'LeaseType': 'lease_record_type',
        'Description': 'lease_description',
        'Status': 'lease_status',
        'Location': 'lease_location_name',
        'Nonprofit': 'nonprofit_lessee',
        'EffectiveDate': 'date_effective',
        'SchedTermination': 'date_sched_termination',
        'BillingRentCode': 'rent_code',
        'RentAmount': 'cost_line_amt_USD'
    })
    df['nonprofit_lessee'] = df['nonprofit_lessee'].fillna(0)
    general.pos_write_csv(df,
                          conf['prod_data_dir'] + '/' + datasd[1],
                          date_format=conf['date_format_ymd_hms'])
    return 'Successfully processed leases data.'

def clean_data():
    """Get the permits file from the temp directory, clean it,
    and save it in the prod directory."""
    df = pd.read_csv(temp_permits)
    df.columns = [x.lower() for x in df.columns]
    df['approval_issue_dt'] = pd.to_datetime(df['approval_issue_dt'],
                                             errors='coerce')
    df['approval_close_dt'] = pd.to_datetime(df['approval_close_dt'],
                                             errors='coerce')
    df['proj_appl_date'] = pd.to_datetime(df['proj_appl_date'],
                                          errors='coerce')
    df['proj_deemed_cmpl_date'] = pd.to_datetime(df['proj_deemed_cmpl_date'],
                                                 errors='coerce')
    df = df.sort_values(by='approval_issue_dt')
    logging.info('Writing all permits')
    general.pos_write_csv(df,
                          prod_permits,
                          date_format=conf['date_format_ymd_hms'])
    return 'Successfully cleaned data.'

def join_council_districts():
    """Spatially joins council districts data to GID data."""
    cd_geojson = conf['prod_data_dir'] + '/council_districts_datasd.geojson'
    gid_cd = spatial_join_pt(ref_file_gid, cd_geojson, lat='lat', lon='long')
    cols = gid_cd.columns.values.tolist()
    drop_cols = ['objectid', 'area', 'perimeter', 'name', 'phone', 'website']
    if "level_0" in cols:
        drop_cols.append('level_0')
    gid_cd = gid_cd.drop(drop_cols, axis=1)
    # Normalize district to a whole-number string, leaving records that
    # did not match a district blank
    gid_cd['district'] = gid_cd['district'].fillna('0')
    gid_cd['district'] = gid_cd['district'].astype(int)
    gid_cd['district'] = gid_cd['district'].astype(str)
    gid_cd['district'] = gid_cd['district'].replace('0', '')
    general.pos_write_csv(gid_cd,
                          cd_file_gid,
                          date_format='%Y-%m-%dT%H:%M:%S%z')
    return "Successfully joined council districts to GID data"

def update_referral_col():
    """Fill in missing referral values."""
    df = pd.read_csv(dates_file_gid,
                     low_memory=False,
                     parse_dates=['date_time_opened', 'date_time_closed'])
    # Later assignments overwrite earlier ones, so the last column checked
    # takes precedence where more than one is populated.
    df['referred'] = ''
    df.loc[df['referral_email_list'].notnull(),
           'referred'] = df.loc[df['referral_email_list'].notnull(),
                                'referral_email_list']
    df.loc[df['referred_department'].notnull(),
           'referred'] = df.loc[df['referred_department'].notnull(),
                                'referred_department']
    df.loc[df['display_referral_information'].notnull(),
           'referred'] = df.loc[df['display_referral_information'].notnull(),
                                'display_referral_information']
    general.pos_write_csv(df,
                          ref_file_gid,
                          date_format='%Y-%m-%dT%H:%M:%S%z')
    return "Successfully updated referral col"

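# Because the assignments above apply in order, the effective precedence is
# display_referral_information > referred_department > referral_email_list.
# A small self-contained demo of the equivalent combine_first formulation
# (column names shortened; an illustration, not the job's code):
import pandas as pd

demo = pd.DataFrame({
    'email_list': ['a@x.gov', None, None],
    'department': [None, 'Streets', None],
    'display_info': ['Stormwater', None, None],
})
demo['referred'] = (demo['display_info']
                    .combine_first(demo['department'])
                    .combine_first(demo['email_list'])
                    .fillna(''))
# referred -> ['Stormwater', 'Streets', '']
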
def process_cfs_data():
    """Update production data with new data."""
    logging.info('Combining daily CFS files.')
    path = conf['temp_data_dir']
    allFiles = glob.glob(
        os.path.join(path, f"calls_for_service_*_*_{curr_year}.csv"))
    np_array_list = []
    for file_ in allFiles:
        df = pd.read_csv(file_,
                         header=None,
                         error_bad_lines=False,
                         low_memory=False)
        # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
        # equivalent
        np_array_list.append(df.values)

    comb_np_array = np.vstack(np_array_list)
    temp_frame = pd.DataFrame(comb_np_array)

    dtypes = {
        'address_number_primary': str,
        'beat': str,
        'priority': str,
        'day_of_week': str
    }

    logging.info('Adding recent data to CFS production file.')
    curr_frame = pd.read_csv(
        f"{conf['prod_data_dir']}/pd_calls_for_service_{curr_year}_datasd.csv",
        parse_dates=['date_time'],
        dtype=dtypes)
    columns_names = curr_frame.columns.values
    temp_frame.columns = columns_names
    temp_frame['date_time'] = pd.to_datetime(temp_frame['date_time'],
                                             errors='coerce')
    for str_col in dtypes.keys():
        temp_frame[str_col] = temp_frame[str_col].astype(str)

    # DataFrame.append() was removed in pandas 2.0; pd.concat is equivalent
    prod_frame = pd.concat([curr_frame, temp_frame], ignore_index=True)
    prod_frame = prod_frame.drop_duplicates(subset=['incident_num'])
    prod_frame = prod_frame.sort_values(by='date_time', ascending=True)

    logging.info('Exporting updated CFS production data to csv.')
    prod_file = conf['prod_data_dir'] \
        + '/pd_calls_for_service_' \
        + curr_year \
        + '_datasd.csv'

    general.pos_write_csv(prod_frame,
                          prod_file,
                          date_format=conf['date_format_ymd_hms'])

    return 'Successfully processed CFS data.'

def create_operating_act():
    """Use operating and ref sets to make operating dataset."""
    budgets = glob.glob(conf['temp_data_dir'] + "/FY*_FINAL_OM_ACTUALS.xlsx")
    for budget in budgets:
        fy_pattern = re.compile(r'([0-9][0-9])')
        this_fy = fy_pattern.findall(budget)
        out_fname = prod_path \
            + "/actuals_operating_FY{}_datasd.csv".format(this_fy[0])

        df = pd.read_excel(budget)
        df = df.iloc[:, [0, 1, 2, 3]]
        df.columns = ['amount', 'code', 'dept_number', 'commitment_item']
        df['code'] = df['code'].astype(str)
        df['dept_number'] = df['dept_number'].astype(str)
        df['commitment_item'] = df['commitment_item'].astype(str)

        fund_ref = pd.read_csv(
            prod_path + "/budget_reference_funds_datasd.csv",
            dtype={'fund_number': str})
        depts_ref = pd.read_csv(
            prod_path + "/budget_reference_depts_datasd.csv",
            dtype={'funds_center_number': str})
        accounts_ref = pd.read_csv(
            prod_path + "/budget_reference_accounts_datasd.csv",
            dtype={'account_number': str})

        df = pd.merge(df,
                      fund_ref[['fund_type', 'fund_number']],
                      left_on='code',
                      right_on='fund_number',
                      how='left')
        df = pd.merge(df,
                      depts_ref[['dept_name', 'funds_center_number']],
                      left_on='dept_number',
                      right_on='funds_center_number',
                      how='left')
        df = pd.merge(df,
                      accounts_ref[['account', 'account_number']],
                      left_on='commitment_item',
                      right_on='account_number',
                      how='left')

        df = df[[
            'amount', 'fund_type', 'fund_number', 'dept_name',
            'funds_center_number', 'account', 'account_number'
        ]]

        general.pos_write_csv(df, out_fname)

    return "Successfully created operating actuals"

def get_pts_violations():
    """Get violations from PTS, creates temp file."""
    logging.info('Retrieving PTS violations.')
    db = cx_Oracle.connect(credentials)
    sql = general.file_to_string('./sql/pts_sw.sql', __file__)
    df = pd.read_sql_query(sql, db)
    general.pos_write_csv(df,
                          temp_file_pts,
                          date_format='%Y-%m-%dT%H:%M:%S%z')
    return "Successfully wrote {} records for dsd_pts violations file".format(
        df.shape[0])

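# `credentials` is assumed to be a cx_Oracle connect string assembled from
# config elsewhere in the module. A hypothetical sketch; the conf keys and
# port are assumptions:
import cx_Oracle

credentials = '{user}/{password}@{host}:{port}/{sid}'.format(
    user=conf['oracle_user'],
    password=conf['oracle_pass'],
    host=conf['oracle_host'],
    port=1521,
    sid=conf['oracle_sid'],
)
db = cx_Oracle.connect(credentials)
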
def inventory_to_csv():
    """Read the published inventory sheet and write it to prod."""
    inventory_prod_path = conf['prod_data_dir'] + '/inventory_datasd_v1.csv'
    df = pd.read_csv(
        "https://docs.google.com/spreadsheets/d/e/2PACX-1vRaEHNs_h56ia6MSa-oTs22qAUjG9lD0t4Sqisq3G0swYRgp0DUoT83WE3mah4amCI0P3me9Bffxcqp/pub?gid=269959199&single=true&output=csv"
    )
    df.columns = [
        'date_added', 'category', 'description', 'date_published',
        'year_fy_target_pub'
    ]
    general.pos_write_csv(df, inventory_prod_path)
    return "Successfully wrote inventory file to prod."

def subset_solar():
    """Create subset of solar permits."""
    df = pd.read_csv(bid_permits)
    solar = df[df['approval_type_id'] == 293]
    general.pos_write_csv(solar,
                          solar_permits,
                          date_format=conf['date_format_ymd_hms'])
    return "Successfully subsetted solar permits"

def create_operating_act():
    """Use operating and ref sets to make operating dataset."""
    budgets = glob.glob(conf['temp_data_dir'] + "/FY*_FINAL_OM_ACTUALS.xlsx")
    for budget in budgets:
        fy_pattern = re.compile(r'([0-9][0-9])')
        this_fy = fy_pattern.findall(budget)
        out_fname = prod_path \
            + "/actuals_operating_FY{}_datasd_v1.csv".format(this_fy[0])

        df = pd.read_excel(budget)
        df = df.iloc[:, [0, 1, 2, 3]]
        df.columns = ['amount', 'code', 'dept_number', 'commitment_item']
        df['code'] = df['code'].astype(str)
        df['dept_number'] = df['dept_number'].astype(str)
        df['commitment_item'] = df['commitment_item'].astype(str)

        fund_ref = pd.read_csv(
            prod_path + "/budget_reference_funds_datasd_v1.csv",
            dtype={'fund_number': str})
        depts_ref = pd.read_csv(
            prod_path + "/budget_reference_depts_datasd_v1.csv",
            dtype={'funds_center_number': str})
        accounts_ref = pd.read_csv(
            prod_path + "/budget_reference_accounts_datasd_v1.csv",
            dtype={'account_number': str})

        df = pd.merge(df,
                      fund_ref[['fund_type', 'fund_number']],
                      left_on='code',
                      right_on='fund_number',
                      how='left')
        df = pd.merge(df,
                      depts_ref[['dept_name', 'funds_center_number']],
                      left_on='dept_number',
                      right_on='funds_center_number',
                      how='left')
        df = pd.merge(df,
                      accounts_ref[['account', 'account_number']],
                      left_on='commitment_item',
                      right_on='account_number',
                      how='left')

        df = df[[
            'amount', 'fund_type', 'fund_number', 'dept_name',
            'funds_center_number', 'account', 'account_number'
        ]]

        general.pos_write_csv(df, out_fname)

    return "Successfully created operating actuals"

def get_active_businesses():
    """Query DB for 'Active Businesses' and save data to temp."""
    logging.info('Retrieving business tax license data')
    db = cx_Oracle.connect(credentials)
    sql = general.file_to_string('./sql/ttcs_biz.sql', __file__)
    df = pd.read_sql_query(sql, db)
    df_rows = df.shape[0]
    logging.info('Query returned {} results'.format(df_rows))
    general.pos_write_csv(df,
                          temp_all,
                          date_format=conf['date_format_ymd_hms'])
    return 'Successfully retrieved active businesses data.'

def join_bids():
    """Spatially joins BIDs data to active businesses data."""
    bids_geojson = conf['prod_data_dir'] + '/bids_datasd.geojson'
    active_bus_bid = geospatial.spatial_join_pt(geocoded_active,
                                                bids_geojson,
                                                lat='latitude',
                                                lon='longitude')
    general.pos_write_csv(active_bus_bid,
                          bids_all,
                          date_format=conf['date_format_ymd'])
    return "Successfully joined BIDs to active businesses"

def clean_data():
    """Clean business license data coming from TTCS."""
    logging.info('Reading query output')
    df = pd.read_csv(temp_all, low_memory=False)
    df.columns = [x.lower() for x in df.columns]

    logging.info('Creating NAICS sector')
    df['naics_sector'] = df['naics_code'].apply(lambda x: str(x)[:2])

    logging.info('Extracting years for filter')
    df['bus_start_yr'] = pd.to_datetime(df['bus_start_dt'],
                                        errors='coerce').dt.year
    df['create_yr'] = pd.to_datetime(df['creation_dt'],
                                     errors='coerce').dt.year

    df_rows = df.shape[0]
    logging.info('Processed {} businesses'.format(df_rows))

    logging.info('Sorting by dba name active date')
    df_sort = df.sort_values(['account_key', 'dba_name_dt', 'address_dt'],
                             ascending=[True, False, False])

    logging.info('Deduping on account key, keeping latest dba and address')
    df_dedupe = df_sort.drop_duplicates(['account_key'])
    total_rows = df_dedupe.shape[0]
    logging.info('Deduped for {} total records'.format(total_rows))

    df_dedupe = df_dedupe.sort_values(by=['account_key', 'creation_dt'],
                                      ascending=[True, False])

    logging.info('Writing final data to csv')
    general.pos_write_csv(df_dedupe,
                          clean_all,
                          date_format=conf['date_format_ymd'])
    return 'Successfully cleaned TTCS data.'

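# How the sort-then-dedupe above behaves: drop_duplicates keeps the first
# row per key, so sorting descending by date first keeps the latest record.
# A tiny illustration with toy data (not TTCS fields):
import pandas as pd

toy = pd.DataFrame({
    'account_key': [1, 1, 2],
    'dba_name_dt': ['2019-01-01', '2020-06-15', '2018-03-02'],
})
latest = (toy.sort_values(['account_key', 'dba_name_dt'],
                          ascending=[True, False])
             .drop_duplicates(['account_key']))
# keeps the 2020-06-15 row for account 1
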
def get_permits_files():
    """Query DB for 'permits' and save data to temp directory."""
    logging.info('Retrieving permits data.')
    db = cx_Oracle.connect(credentials)
    sql = general.file_to_string('./sql/pts.sql', __file__)
    sql += ("WHERE a.issue_dt >= TO_DATE('{0}-JAN-01', 'YYYY-MON-DD') "
            "AND a.issue_dt < TO_DATE('{1}-JAN-01', 'YYYY-MON-DD')"
            ).format(year, year + 1)
    df = pd.read_sql_query(sql, db)
    logging.info('Query returned {} results for {}'.format(df.shape[0], year))
    general.pos_write_csv(df,
                          temp_permits,
                          date_format=conf['date_format_ymd_hms'])
    return 'Successfully retrieved permits data.'

def get_claims_data():
    """Query an Oracle database."""
    logging.info('Retrieving data from Oracle database')
    # This requires the otherwise-optional credentials variable
    db = cx_Oracle.connect(credentials)
    # Create a sql file containing the query for the database.
    # Save this file in a sql folder at the same level as the jobs file.
    sql = general.file_to_string('./sql/claimstat_tsw.sql', __file__)
    df = pd.read_sql_query(sql, db)
    logging.info(f'Query returned {df.shape[0]} results')
    general.pos_write_csv(df, "{}/claimstat_raw.csv".format(tmp))
    return 'Successfully retrieved Oracle data.'

def get_streets_paving_data():
    """Get streets paving data from DB."""
    pv_query = general.file_to_string('./sql/pavement_ex.sql', __file__)
    pv_conn = MsSqlHook(mssql_conn_id='streets_cg_sql')
    df = pv_conn.get_pandas_df(pv_query)
    results = df.shape[0]
    general.pos_write_csv(df, temp_file)
    return f"Successfully wrote temp file with {results} records"

def get_special_events():
    """Get special events from DB."""
    se_query = general.file_to_string('./sql/special_events.sql', __file__)
    se_conn = MsSqlHook(mssql_conn_id='special_events_sql')
    df = se_conn.get_pandas_df(se_query)
    df['event_id'] = pd.to_numeric(df['event_id'],
                                   errors='coerce',
                                   downcast='integer')
    general.pos_write_csv(df,
                          temp_file,
                          date_format=conf['date_format_ymd_hms'])
    return "Retrieved special events to temp file."

def latest_res_ords():
    """Get the last decade from the resos and ords table."""
    filename = 'documentum_scs_council_reso_ordinance_v.csv'
    save_path = f"{conf['prod_data_dir']}/documentum_scs_council_reso_ordinance_v"
    df = pd.read_csv(f"{conf['prod_data_dir']}/{filename}", low_memory=False)
    df['DOC_DATE'] = pd.to_datetime(df['DOC_DATE'], errors='coerce')
    df_current = df.loc[df['DOC_DATE'] >= "01/01/2016"]
    general.pos_write_csv(df_current, f"{save_path}_2016_current.csv")
    logging.info("Wrote 2016_current")
    return "Successfully extracted this decade of resos and ords"

def build_aggregation(agg_type="pole_by_month", **kwargs):
    """Aggregate raw production data by month/day."""
    out_fname = 'treas_meters_{0}_{1}_datasd_v1.csv'.format(cur_yr, agg_type)

    logging.info("Reading portal data " + portal_fname)
    portal = pd.read_csv(portal_fname)

    logging.info("Translate start_date to dt, create agg columns")
    portal['date_trans_start'] = pd.to_datetime(portal['date_trans_start'],
                                                format="%Y-%m-%d %H:%M:%S",
                                                errors='coerce')
    portal['month'] = portal.date_trans_start.dt.month
    portal['day'] = portal.date_trans_start.dt.day

    logging.info("Creating " + agg_type + " aggregation")
    # Dict-based column renaming in .agg() was removed in pandas 1.0;
    # named aggregation is the equivalent.
    if agg_type == 'pole_by_month':
        grouped = portal.groupby(['pole_id', 'month'], as_index=False)
        aggregation = grouped.agg(sum_trans_amt=('trans_amt', 'sum'),
                                  num_trans=('trans_amt', 'count'))
    elif agg_type == 'pole_by_mo_day':
        grouped = portal.groupby(['pole_id', 'month', 'day'], as_index=False)
        aggregation = grouped.agg(sum_trans_amt=('trans_amt', 'sum'),
                                  num_trans=('trans_amt', 'count'))
    else:
        raise NotImplementedError("Not sure what " + agg_type + " is")

    new_file_path = '{0}/{1}'.format(conf['prod_data_dir'], out_fname)

    logging.info("Writing " + agg_type + " aggregation")
    general.pos_write_csv(aggregation,
                          new_file_path,
                          date_format=conf['date_format_ymd_hms'])

    return "Updated agg " + agg_type + " file " + new_file_path

def build_aggregation(agg_type="pole_by_month", **kwargs):
    """Aggregate raw production data by month/day."""
    out_fname = 'treas_meters_{0}_{1}_datasd.csv'.format(cur_yr, agg_type)

    logging.info("Reading portal data " + portal_fname)
    portal = pd.read_csv(portal_fname)

    logging.info("Translate start_date to dt, create agg columns")
    portal['trans_start'] = pd.to_datetime(portal['trans_start'],
                                           format="%Y-%m-%d %H:%M:%S",
                                           errors='coerce')
    portal['month'] = portal.trans_start.dt.month
    portal['day'] = portal.trans_start.dt.day

    logging.info("Creating " + agg_type + " aggregation")
    # Dict-based column renaming in .agg() was removed in pandas 1.0;
    # named aggregation is the equivalent.
    if agg_type == 'pole_by_month':
        grouped = portal.groupby(['pole_id', 'month'], as_index=False)
        aggregation = grouped.agg(sum_trans_amt=('trans_amt', 'sum'),
                                  num_trans=('trans_amt', 'count'))
    elif agg_type == 'pole_by_mo_day':
        grouped = portal.groupby(['pole_id', 'month', 'day'], as_index=False)
        aggregation = grouped.agg(sum_trans_amt=('trans_amt', 'sum'),
                                  num_trans=('trans_amt', 'count'))
    else:
        raise NotImplementedError("Not sure what " + agg_type + " is")

    new_file_path = '{0}/{1}'.format(conf['prod_data_dir'], out_fname)

    logging.info("Writing " + agg_type + " aggregation")
    general.pos_write_csv(aggregation,
                          new_file_path,
                          date_format=conf['date_format_ymd_hms'])

    return "Updated agg " + agg_type + " file " + new_file_path

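# How these aggregations are presumably invoked, e.g. as two task callables
# (the kwargs would come from the scheduler/operator):
build_aggregation(agg_type='pole_by_month')
build_aggregation(agg_type='pole_by_mo_day')
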
def get_fd_data(**kwargs):
    """Get fire department data from database."""
    logging.info("Get fire department data from CAD archive")
    fd_query = general.file_to_string('./sql/fd.sql', __file__)
    fd_conn = MsSqlHook(mssql_conn_id='fire_department')

    logging.info("Read data to pandas DataFrame")
    df = fd_conn.get_pandas_df(fd_query)

    # Write csv
    logging.info('Writing ' + prod_file)
    general.pos_write_csv(df,
                          prod_file,
                          date_format=conf['date_format_ymd_hms'])
    return "Successfully wrote prod file at " + prod_file

def process_parcels():
    """Process parcels data."""
    df = pd.read_csv(conf['temp_data_dir'] + '/' + datasd[2],
                     low_memory=False,
                     error_bad_lines=False,
                     encoding='cp1252')
    df = df.drop('Comments', axis=1)
    df = df.rename(columns={'SiteCode': 'site_code',
                            'ParcelNumber': 'APN-8'})
    general.pos_write_csv(df,
                          conf['prod_data_dir'] + '/' + datasd[2],
                          date_format=conf['date_format_ymd_hms'])
    return 'Successfully processed parcels data.'

def update_dsd(key):
    """Add weekly data to current production data."""
    y_src = conf['prod_data_dir'] + '/' + approval_dict[key][1]
    w_src = dsd_temp_dir + key + '_week.csv'
    ytd = pd.read_csv(y_src)
    week = pd.read_csv(w_src)
    # DataFrame.append() was removed in pandas 2.0; pd.concat is equivalent
    prod = pd.concat([ytd, week], ignore_index=True)
    prod = prod.drop_duplicates(subset=['approval_id'])
    if key == 'applied':
        prod = prod.sort_values(by='application_date')
    elif key == 'issued':
        prod = prod.sort_values(by='issue_date')
    elif key == 'completed':
        prod = prod.sort_values(by='complete_cancel_date')
    general.pos_write_csv(prod,
                          y_src,
                          date_format=conf['date_format_ymd_hms'])
    return 'Successfully updated ' + key + ' permits.'

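# approval_dict is assumed to map each permit stage to per-stage file names;
# only index 1 (the ytd production csv) is used in these jobs. A purely
# hypothetical shape, with guessed file names:
approval_dict = {
    'applied': ('PERMITS APPLIED', 'permits_applied_ytd_datasd.csv'),
    'issued': ('PERMITS ISSUED', 'permits_issued_ytd_datasd.csv'),
    'completed': ('PERMITS COMPLETED', 'permits_completed_ytd_datasd.csv'),
}
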
def extract_solar(key):
    """Extract solar permits from production files."""
    prod_src = conf['prod_data_dir'] + '/' + approval_dict[key][1]
    solar_pmts = conf['prod_data_dir'] + '/' \
        + 'solar_permits_' + key + '_ytd_datasd.csv'
    ytd = pd.read_csv(prod_src)
    solar = ytd[ytd['approval_type_id'] == 293]
    if key == 'applied':
        solar = solar.sort_values(by='application_date')
    elif key == 'issued':
        solar = solar.sort_values(by='issue_date')
    elif key == 'completed':
        solar = solar.sort_values(by='complete_cancel_date')
    general.pos_write_csv(solar,
                          solar_pmts,
                          date_format=conf['date_format_ymd_hms'])
    return 'Successfully updated ' + key + ' solar permits.'

def join_bids():
    """Spatially joins permits to Business Improvement Districts."""
    bids_geojson = conf['prod_data_dir'] + '/bids_datasd.geojson'
    bids_join = spatial_join_pt(prod_permits,
                                bids_geojson,
                                lat='lat_job',
                                lon='lng_job')
    bids_join = bids_join.drop(['objectid', 'long_name', 'status', 'link'],
                               axis=1)
    bids_join = bids_join.rename(columns={'name': 'bid_name'})
    general.pos_write_csv(bids_join,
                          bid_permits,
                          date_format='%Y-%m-%dT%H:%M:%S%z')
    return 'Successfully joined permits to BIDs'

def process_cfs_data():
    """Update production data with new data."""
    logging.info('Combining daily CFS files.')
    path = conf['temp_data_dir']
    allFiles = glob.glob(os.path.join(path, "calls*.csv"))
    np_array_list = []
    for file_ in allFiles:
        df = pd.read_csv(file_,
                         header=None,
                         error_bad_lines=False,
                         low_memory=False)
        # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
        # equivalent
        np_array_list.append(df.values)

    comb_np_array = np.vstack(np_array_list)
    temp_frame = pd.DataFrame(comb_np_array)

    logging.info('Adding recent data to CFS production file.')
    curr_frame = pd.read_csv(conf['prod_data_dir']
                             + '/pd_calls_for_service_'
                             + curr_year
                             + '_datasd.csv')
    columns_names = curr_frame.columns.values
    temp_frame.columns = columns_names

    # DataFrame.append() was removed in pandas 2.0; pd.concat is equivalent
    prod_frame = pd.concat([curr_frame, temp_frame], ignore_index=True)
    prod_frame = prod_frame.drop_duplicates(subset=['incident_num'])
    prod_frame['date_time'] = pd.to_datetime(prod_frame['date_time'])
    prod_frame['day'] = prod_frame['day'].astype(int)
    prod_frame['stno'] = prod_frame['stno'].astype(int)
    prod_frame['beat'] = pd.to_numeric(prod_frame['beat'], errors='coerce')
    prod_frame['priority'] = pd.to_numeric(prod_frame['priority'],
                                           errors='coerce')
    prod_frame = prod_frame.sort_values(by='date_time', ascending=True)

    logging.info('Exporting updated CFS production data to csv.')
    prod_file = conf['prod_data_dir'] \
        + '/pd_calls_for_service_' \
        + curr_year \
        + '_datasd.csv'

    general.pos_write_csv(prod_frame,
                          prod_file,
                          date_format=conf['date_format_ymd_hms'])

    return 'Successfully processed CFS data.'

def get_cip_data(**kwargs):
    """Get CIP data from the Oracle database."""
    logging.info("Get CIP data from Oracle database")
    cip_query = general.file_to_string('./sql/cip.sql', __file__)
    cip_conn = cx_Oracle.connect(credentials)

    logging.info("Read data to pandas DataFrame")
    df = pd.read_sql_query(cip_query, cip_conn)
    rows = df.shape[0]

    # Write csv
    logging.info('Writing {} rows to prod'.format(rows))
    general.pos_write_csv(df,
                          prod_file,
                          date_format=conf['date_format_ymd_hms'])
    return "Successfully wrote prod file"

def get_documentum(mode, **kwargs):
    """Get tables from Documentum."""
    logging.info('Getting files from documentum')
    table_name = dn.table_name(mode)
    for name in table_name:
        logging.info('Querying for {0} table'.format(name))
        query_string = 'SELECT * FROM SCSLEGIS.dbo.{0};'.format(name)

        logging.info('Connecting to MS Database')
        documentum_conn = MsSqlHook(mssql_conn_id='docm_sql')

        logging.info('Reading data to Pandas DataFrame')
        df = documentum_conn.get_pandas_df(query_string)

        logging.info('Correcting title column')
        df['TITLE'] = fix_title(df[['TITLE', 'OBJECT_NAME']])

        save_path = conf['prod_data_dir'] \
            + '/documentum_{0}.csv'.format(name.lower())

        logging.info('Writing Production file')
        general.pos_write_csv(df, save_path)

    return "Successfully retrieved Documentum tables"

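# fix_title is assumed to be a module helper that repairs the TITLE column.
# Judging from the call sites (it receives a two-column frame and returns a
# Series), a plausible minimal sketch, not the project's actual code:
def fix_title(frame):
    """Sketch: return TITLE, falling back to OBJECT_NAME where TITLE
    is missing."""
    return frame['TITLE'].fillna(frame['OBJECT_NAME'])
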
def process_collisions_data():
    """Process collision data."""
    prod_file = conf['prod_data_dir'] + '/pd_collisions_datasd.csv'
    df = pd.read_csv(conf['temp_data_dir'] + '/temp_collisions.csv',
                     header=None,
                     error_bad_lines=False)
    df.columns = [
        'report_id', 'date_time', 'police_beat', 'street_no', 'street_dir',
        'street_name', 'street_type', 'cross_st_dir', 'cross_st_name',
        'cross_st_type', 'violation_section', 'violation_type',
        'charge_desc', 'injured', 'killed', 'hit_run_lvl'
    ]
    general.pos_write_csv(df,
                          prod_file,
                          date_format=conf['date_format_ymd_hms'])
    return 'Successfully processed collisions data.'

def split_reso_ords():
    """Split the largest table of resos and ords."""
    filename = 'documentum_scs_council_reso_ordinance_v.csv'
    save_path = f"{conf['prod_data_dir']}/documentum_scs_council_reso_ordinance_v"
    df = pd.read_csv(f"{conf['prod_data_dir']}/{filename}", low_memory=False)
    total_records = df.shape[0]
    record_count = 0
    logging.info(f"Dividing {total_records} records")
    df['DOC_DATE'] = pd.to_datetime(df['DOC_DATE'], errors='coerce')
    div_years = [1976, 1986, 1996, 2006, 2016]
    for i, year in enumerate(div_years):
        if i == 0:
            sub_div = df.loc[df['DOC_DATE'] < f"01/01/{year}"]
            general.pos_write_csv(sub_div, f"{save_path}_begin_{year-1}.csv")
            logging.info(f"Wrote begin_{year-1}")
            record_count += sub_div.shape[0]
        else:
            sub_div = df.loc[(df['DOC_DATE'] < f"01/01/{year}")
                             & (df['DOC_DATE'] >= f"01/01/{div_years[i-1]}")]
            general.pos_write_csv(
                sub_div, f"{save_path}_{div_years[i-1]}_{year-1}.csv")
            logging.info(f"Wrote {div_years[i-1]}_{year-1}")
            record_count += sub_div.shape[0]

    # Records whose DOC_DATE failed to parse go to a separate file
    df_invalid = df.loc[df['DOC_DATE'].isnull()]
    general.pos_write_csv(df_invalid, f"{save_path}_invalid.csv")
    logging.info("Wrote records with invalid date")
    record_count += df_invalid.shape[0]

    return f"Successfully divided {record_count} from {filename}"

def get_sf_violations():
    """Get violations from SF, creates temp file."""
    username = conf['dpint_sf_user']
    password = conf['dpint_sf_pass']
    security_token = conf['dpint_sf_token']
    report_id = "00Ot0000000TPXC"

    # Init salesforce client
    sf = Salesforce(username, password, security_token)

    # Pull dataframe
    logging.info('Pull report {} from SF'.format(report_id))
    df = sf.get_report_df(report_id)
    logging.info('Process report {} data.'.format(report_id))

    general.pos_write_csv(df,
                          temp_file_sf,
                          date_format='%Y-%m-%dT%H:%M:%S%z')

    return "Successfully wrote {} records for tsw_sf violations file".format(
        df.shape[0])

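# get_report_df is not part of the standard simple-salesforce API, so the
# Salesforce client here is assumed to be a project wrapper. A hypothetical
# sketch of the report pull using the classic CSV export query string; the
# URL pattern, and that `session` is an authenticated requests.Session,
# are assumptions:
import io

import pandas as pd
import requests

def get_report_df(session, instance_url, report_id):
    """Sketch: download a Salesforce report as csv and parse it."""
    url = "{0}/{1}?export=1&enc=UTF-8&xf=csv".format(instance_url, report_id)
    resp = session.get(url)
    resp.raise_for_status()
    return pd.read_csv(io.StringIO(resp.text))
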
def get_onbase():
    """Get tables from OnBase."""
    logging.info('Getting files from onbase')
    for root, dirs, files in os.walk('./poseidon/dags/city_docs/sql/onbase'):
        for name in files:
            logging.info('Querying for ' + name)
            path = './sql/onbase/{}'.format(name)
            query_string = general.file_to_string(path, __file__)

            logging.info('Connecting to MS Database')
            onbase_conn = MsSqlHook(mssql_conn_id='onbase_sql')

            logging.info('Reading data to Pandas DataFrame')
            df = onbase_conn.get_pandas_df(query_string)

            # Strip the .sql extension to get the table type
            table_type = name[0:-4]

            logging.info('Correcting title column')
            df['TITLE'] = fix_title(df[['TITLE', 'OBJECT_NAME']])

            save_path = '{0}/onbase_{1}.csv'.format(conf['prod_data_dir'],
                                                    table_type)

            logging.info('Writing Production file')
            general.pos_write_csv(df, save_path)

    return "Successfully retrieved OnBase tables"

def clean_data():
    """Get the permits file from the temp directory, clean it,
    and save it in the prod directory."""
    filename = conf['temp_data_dir'] + "/*Panda_Extract_PermitActivities*.txt"
    list_of_files = glob.glob(filename)
    latest_file = max(list_of_files, key=os.path.getmtime)
    logging.info(f"Reading in {latest_file}")
    df = pd.read_table(latest_file, sep=",", encoding="ISO-8859-1")
    df.columns = [x.lower() for x in df.columns]

    final_cols = [
        "approval_id", "approval_type_id", "short_desc", "approval_type",
        "appr_proc_code", "cat_code", "authority", "appl_days",
        "approval_status", "date_approval_issue", "date_approval_close",
        "job_id", "proj_id", "devel_id", "proj_title", "proj_scope",
        "proj_job_order", "date_proj_appl", "date_proj_comp", "lng_job",
        "lat_job", "job_apn", "address", "com_plan_id", "com_plan",
        "cust_name", "valuation", "stories", "units", "floorareas",
        "bc_group"
    ]

    df = df.rename(columns={
        'job_lat': 'lat_job',
        'job_lng': 'lng_job',
        'job_address': 'address',
        'approval_issue_dt': 'date_approval_issue',
        'approval_close_dt': 'date_approval_close',
        'proj_appl_date': 'date_proj_appl',
        'proj_deemed_cmpl_date': 'date_proj_comp'
    })

    df_final = df[final_cols]

    general.pos_write_csv(df_final,
                          prod_permits,
                          date_format=conf['date_format_ymd_hms'])
    return 'Successfully cleaned data.'

def get_fd_data(**kwargs):
    """Get fire department data from database."""
    logging.info("Get fire department data from database")
    fd_query = general.file_to_string('./sql/fd_pn.sql', __file__)
    fd_conn = MsSqlHook(mssql_conn_id='fire_department')

    logging.info("Read data to pandas DataFrame")
    df = fd_conn.get_pandas_df(fd_query)
    df = df.rename(columns={
        'city': 'address_city',
        'response_month': 'month_response',
        'response_year': 'year_response'
    })

    logging.info("Writing {} rows to prod".format(df.shape[0]))

    # Write csv
    general.pos_write_csv(df, prod_file)
    return "Successfully wrote prod file"

def get_requests_service_name(service_name, machine_service_name):
    """Create GID files by service type."""
    gid_csv = services_file
    gid = pd.read_csv(gid_csv, low_memory=False)
    data = gid.loc[gid['service_name'].str.contains(service_name, na=False),
                   :].copy()

    if data.shape[0] == 0:
        raise ValueError("{} is not a valid service name".format(service_name))

    data = data.reset_index()

    if service_name != 'Illegal Dumping':
        data = data.drop(['index', 'specify_the_issue'], axis=1)
    else:
        data = data.drop(['index'], axis=1)

    out_path = prod_file_base + machine_service_name + '_' + prod_file_end
    general.pos_write_csv(data, out_path, date_format='%Y-%m-%dT%H:%M:%S%z')

    return "Successfully wrote {} records for gid {} prod file".format(
        data.shape[0], machine_service_name)

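# Example invocation: the service name is matched against the service_name
# column, and the machine name becomes part of the output file name, so it
# just needs to be filesystem-safe:
get_requests_service_name('Illegal Dumping', 'illegal_dumping')
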
def get_sf_violations():
    """Get violations from SF, creates temp file."""
    username = conf['mrm_sf_user']
    password = conf['mrm_sf_pass']
    security_token = conf['mrm_sf_token']
    report_id = "00Ot0000000TPXC"

    # Init salesforce client
    sf = Salesforce(username, password, security_token)

    # Pull dataframe
    logging.info('Pull report {} from SF'.format(report_id))
    df = sf.get_report_df(report_id)
    logging.info('Process report {} data.'.format(report_id))

    general.pos_write_csv(df,
                          temp_file_sf,
                          date_format='%Y-%m-%dT%H:%M:%S%z')

    return "Successfully wrote {} records for tsw_sf violations file".format(
        df.shape[0])

def get_onbase_test():
    """Get tables from OnBase."""
    logging.info('Getting files from onbase')
    for root, dirs, files in os.walk('./poseidon/dags/city_docs/sql/onbase'):
        for name in files:
            logging.info('Querying for ' + name)
            path = './sql/onbase/{}'.format(name)
            query_string = general.file_to_string(path, __file__)

            logging.info('Connecting to MS Database')
            onbase_conn = MsSqlHook(mssql_conn_id='onbase_test_sql')

            logging.info('Reading data to Pandas DataFrame')
            df = onbase_conn.get_pandas_df(query_string)

            # Strip the .sql extension to get the table type
            table_type = name[0:-4]

            logging.info('Correcting title column')
            df['TITLE'] = fix_title(df[['TITLE', 'OBJECT_NAME']])

            save_path = '{0}/onbase_test_{1}.csv'.format(
                conf['prod_data_dir'], table_type)

            logging.info('Writing Production file')
            general.pos_write_csv(df, save_path)

    return "Successfully retrieved OnBase tables"

def process_properties_details():
    """Process properties details data."""
    df = pd.read_csv(conf['temp_data_dir'] + '/' + datasd[3],
                     low_memory=False,
                     error_bad_lines=False,
                     encoding='cp1252')
    df = df.rename(columns={
        'SiteCode': 'site_code',
        'FileCode': 'file_code',
        'Grantor': 'grantor',
        'MonthAcquired': 'month_acquired',
        'YearAcquired': 'year_acquired',
        'PurchaseFund': 'purchase_fund',
        'LandCost': 'land_cost',
        'BldgCost': 'building_cost',
        'ClosingCost': 'closing_cost',
        'SiteName': 'site_name',
        'ManagingGroup': 'managing_group',
        'ManagingDept': 'managing_dept',
        'DesignatedUse': 'designated_use',
        'SiteAcres': 'site_acres',
        'SubsiteAcres': 'file_acres',
        'SubsiteOriginalAcres': 'original_acres',
        'DedicatedPark': 'dedicated_park',
        'WaterUse': 'water_use',
        'UseRestrictions': 'use_restrictions',
        'ResOrOrd': 'desig_reso_ord',
        'ResOrOrdDate': 'reso_ord_date'
    })
    general.pos_write_csv(df,
                          conf['prod_data_dir'] + '/' + datasd[3],
                          date_format=conf['date_format_ymd_hms'])
    return 'Successfully processed properties details data.'

def process_properties_details():
    """Process properties details data."""
    df = pd.read_csv(conf['temp_data_dir'] + '/' + datasd[3],
                     low_memory=False,
                     error_bad_lines=False,
                     encoding='cp1252')
    df = df.rename(columns={
        'SiteCode': 'site_code',
        'FileCode': 'file_code',
        'Grantor': 'grantor',
        'MonthAcquired': 'month_acquired',
        'YearAcquired': 'year_acquired',
        'PurchaseFund': 'purchase_fund',
        'LandCost': 'land_cost',
        'BldgCost': 'building_cost',
        'ClosingCost': 'closing_cost',
        'SiteName': 'site_name',
        'ManagingGroup': 'managing_group',
        'ManagingDept': 'managing_dept',
        'DesignatedUse': 'designated_use',
        'SiteAcres': 'site_acres',
        'SubsiteAcres': 'file_acres',
        'SubsiteOriginalAcres': 'original_acres',
        'DedicatedPark': 'dedicated_park',
        'WaterUse': 'water_use',
        'UseRestrictions': 'use_restrictions',
        'ResOrOrd': 'desig_reso_ord',
        'ResOrOrdDate': 'date_reso_ord'
    })
    general.pos_write_csv(df,
                          conf['prod_data_dir'] + '/' + datasd[3],
                          date_format=conf['date_format_ymd_hms'])
    return 'Successfully processed properties details data.'

def combine_all_schedules():
    """Transactions combined into one file for year-to-date."""
    schedule_460a = pd.read_csv(conf['temp_data_dir'] + '/schedule_460a.csv')
    schedule_460b1 = pd.read_csv(conf['temp_data_dir'] + '/schedule_460b1.csv')
    schedule_460c = pd.read_csv(conf['temp_data_dir'] + '/schedule_460c.csv')
    schedule_460d = pd.read_csv(conf['temp_data_dir'] + '/schedule_460d.csv')
    schedule_460sum = pd.read_csv(conf['temp_data_dir']
                                  + '/schedule_460sum.csv')
    form_497 = pd.read_csv(conf['temp_data_dir'] + '/schedule_497.csv')
    form_496 = pd.read_csv(conf['temp_data_dir'] + '/schedule_496.csv')

    outputDF = pd.concat([
        schedule_460a, schedule_460b1, schedule_460c, schedule_460d,
        schedule_460sum, form_497, form_496
    ], ignore_index=True)

    general.pos_write_csv(outputDF,
                          prod_file,
                          date_format='%Y-%m-%dT%H:%M:%S%z')

    return "Created prod file"

def join_bids():
    """Spatially joins permits to Business Improvement Districts."""
    bids_geojson = conf['prod_data_dir'] + '/bids_datasd.geojson'
    bids_join = spatial_join_pt(prod_permits,
                                bids_geojson,
                                lat='job_lat',
                                lon='job_lng')
    bids_join = bids_join.drop(['objectid', 'long_name', 'status', 'link'],
                               axis=1)
    bids_join = bids_join.rename(columns={'name': 'bid_name'})
    general.pos_write_csv(bids_join,
                          bid_permits,
                          date_format='%Y-%m-%dT%H:%M:%S%z')
    return 'Successfully joined permits to BIDs'

def build_traffic_counts(src_fname='traffic_counts_raw_clean',
                         out_fname='traffic_counts_datasd'):
    """Build traffic counts production data."""
    src_file = "{0}/{1}.csv".format(conf['temp_data_dir'], src_fname)
    out_file = "{0}/{1}.csv".format(conf['prod_data_dir'], out_fname)

    # read in csv from temp
    counts = pd.read_csv(src_file)

    # remove rows that are part of the main worksheet but empty for some
    # reason
    counts = counts[counts['street_name'] != ' ']

    # date type
    counts['count_date'] = pd.to_datetime(counts['count_date'])

    # create id field based on file id and street
    counts['id'] = counts.street_name.str.cat(counts.file_no, sep="")\
                         .str.replace(" ", "")\
                         .str.replace("-", "")

    # reorder columns so the id comes first
    cols = counts.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    counts = counts[cols]

    # write to production file
    general.pos_write_csv(counts,
                          out_file,
                          date_format=conf['date_format_ymd_hms'])

    return "Successfully built traffic counts production file."

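# The two traffic-counts steps are presumably chained (clean writes the temp
# csv that build then reads), e.g. as successive tasks:
clean_traffic_counts()    # xlsx -> temp csv
build_traffic_counts()    # temp csv -> prod csv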