def readDMV():
    '''Read the DMV crash, individual and vehicle tables and clean them up.

    Pulls ``anaphi.crash``, ``anaphi.individual`` and ``anaphi.vehicle``
    through ``moda.databridge``, removes the rows where the file header was
    repeated, adds ``year``/``month``/``day``/``date`` columns to the crash
    table, and converts vehicle body type and individual age to numbers.

    Returns:
        tuple: ``(crash, ind, veh)`` -- the three cleaned tables.
    '''
    crash = moda.databridge("select * from anaphi.crash", encrypted='yes')
    ind = moda.databridge("select * from anaphi.individual", encrypted='yes')
    veh = moda.databridge("select * from anaphi.vehicle", encrypted='yes')

    # The header got repeated in a couple of the files; remove those rows.
    # NOTE(review): the row labels 133 and 484 are hard-coded for the current
    # extract -- confirm they still apply if the tables are reloaded.
    crash.drop(133, inplace=True)
    ind.drop(484, inplace=True)

    # Reformat the date: CSACCDTE is sliced as chars 0-3 (year), 5-6 (month)
    # and 8-9 (day), then assembled into a single datetime column.
    crash['year'] = crash.CSACCDTE.str[:4]
    crash['day'] = crash.CSACCDTE.str[8:10]
    crash['month'] = crash.CSACCDTE.str[5:7]
    crash['date'] = pd.to_datetime(crash[['year', 'month', 'day']])

    # Reformat age and vehicle type to numbers. 'coerce' is the documented
    # option that turns unparseable values into NaN instead of raising.
    veh.VEHBDYT_ID = pd.to_numeric(veh.VEHBDYT_ID, errors='coerce')
    ind.INDIV_AGE = pd.to_numeric(ind.INDIV_AGE, errors='coerce')

    # Single-argument print emits the same text under Python 2 and Python 3.
    print('full crash table %s' % (crash.shape,))
    print('full person table %s' % (ind.shape,))
    print('full vehicle table %s' % (veh.shape,))
    return crash, ind, veh
def gatherConst():
    '''Query DOB job filings and return alteration jobs that span multiple floors.

    The query covers jobs pre-filed from 2007-01-01 up to, but not including,
    2016-01-01. Duplicate rows are dropped, a ``jobYear`` column is derived
    from the pre-filing date, and rows whose BBL or BIN is not a number
    greater than 1 are removed.

    Returns:
        The jobs whose ``J_JOB_TYPE_DESC`` contains 'Alteration' and whose
        ``J_FLOOR`` contains a comma or a dash. Missing values in the
        returned table are replaced with the string '0'.
    '''
    query = '''
        SELECT J_BORO||J_BLOCK||J_LOT AS BBL,
               J_BIN_NUMBER as BIN,
               J_PRE_FILING_DATE,
               J_FLOOR,
               J_JOB_TYPE_DESC
        FROM anafic.wc_dob_job_f
        WHERE J_PRE_FILING_DATE >= TO_DATE('2007-01-01' , 'YYYY-MM-DD HH24:MI:SS')
        AND J_PRE_FILING_DATE < TO_DATE('2016-01-01' , 'YYYY-MM-DD HH24:MI:SS')
    '''
    jobs = moda.databridge(query)
    jobs.drop_duplicates(inplace=True)

    # Extract the year from the job filing date.
    jobs['jobYear'] = jobs.J_PRE_FILING_DATE.apply(lambda x: x.year)

    # Convert the identifiers to numbers. 'coerce' is the documented option
    # that turns unparseable values into NaN; the > 1 filters below then
    # drop those rows (NaN > 1 is False).
    jobs.BBL = pd.to_numeric(jobs.BBL, errors='coerce')
    jobs.BIN = pd.to_numeric(jobs.BIN, errors='coerce')
    jobs = jobs[jobs.BBL > 1]
    jobs = jobs[jobs.BIN > 1]

    # Only keep alteration jobs. na=False treats a missing description as
    # "not an alteration" rather than producing a NaN in the boolean mask.
    altjobs = jobs[jobs['J_JOB_TYPE_DESC'].str.contains('Alteration', na=False)]

    # Only keep jobs with work on multiple floors (J_FLOOR contains a comma
    # or dash). Missing values are filled once so the mask and the returned
    # table come from the same filled frame.
    filled = altjobs.fillna('0')
    jobsMultFl = filled[filled['J_FLOOR'].str.contains('-|,')]

    # Single-argument print emits the same text under Python 2 and Python 3.
    print('jobs %s' % (jobs.shape,))
    return jobsMultFl
def gatherSale():
    '''Query DOF change-of-ownership (sale) records for 2011 through 2014.

    The query joins the ownership fact table to the location dimension and
    excludes building classes starting with 'A' or 'B' (one- and two-family
    homes, per the query comments), future-dated updates and non-positive
    BBLs.

    Returns:
        A table with ``DOF_OWNR_NAME_UPDTD``, a numeric ``BBL`` and the
        derived ``saleYear``, with duplicate rows removed.
    '''
    query = '''
        SELECT DOF_OWNR_NAME_UPDTD, BBL
        FROM anafic.wc_dof_ownership_f
        LEFT OUTER JOIN anafic.wc_location_d
        ON wc_dof_ownership_f.location_wid = wc_location_d.row_wid
        WHERE dof_ownr_name_updtd <= sysdate --don't include dates from dec 31 9999
        AND dof_ownr_name_updtd >= TO_DATE('2011-01-01' , 'YYYY-MM-DD HH24:MI:SS')
        AND dof_ownr_name_updtd < TO_DATE('2015-01-01' , 'YYYY-MM-DD HH24:MI:SS')
        AND BBL > 0 --to remove null BBLs
        AND bldg_class NOT LIKE 'A%' --to remove single family homes
        AND bldg_class NOT LIKE 'B%' --to remove 2 family homes'''
    own = moda.databridge(query)

    # Convert BBL to a number. 'coerce' is the documented option that turns
    # unparseable values into NaN instead of raising.
    own.BBL = pd.to_numeric(own.BBL, errors='coerce')

    # Extract the year from the sale (ownership update) date.
    own['saleYear'] = pd.to_datetime(own.DOF_OWNR_NAME_UPDTD).dt.year

    own.drop_duplicates(inplace=True)

    # Single-argument print emits the same text under Python 2 and Python 3.
    print('DOF %s' % (own.shape,))
    return own
def gather311():
    '''Return yearly 311 complaint counts tied to a BBL for 2011 through 2014.

    The query excludes organisations whose top level name is 'DSNY' and
    counts service requests per BBL, BIN, agency, type, area, sub-area and
    year. Rows whose BBL is not a positive number are dropped.

    Returns:
        The grouped complaint counts. Any remaining missing values
        (including BINs that could not be converted to a number) are
        replaced with the string '-'.
    '''
    query = '''
        with complaints as (
            select upper(org.LVL8ANC_NAME) as agency,
                   srvreq_d.X_BBL as BBL,
                   srvreq_d.X_BIN as BIN,
                   srvreq_md.TYPE_CD_I,
                   srvreq_md.AREA,
                   srvreq_md.SUB_AREA,
                   srvreq_d.SR_NUM,
                   day_d.DAY_DT,
                   extract(year from day_dt) as YEAR_DT
            from (SELECT * FROM anaprd.W_ORG_DH
                  WHERE TOP_LVL_NAME <> 'DSNY'
                  and (X_AGENCY_ACRONYM IS NOT NULL OR TOP_LVL_ID <> LVL8ANC_ID OR FIXED_HIER_LEVEL <> 9)) org,
                 anaprd.W_SRVREQ_MD srvreq_md,
                 anaprd.W_DAY_D day_d,
                 anaprd.W_SRVREQ_D srvreq_d,
                 anaprd.W_SRVREQ_F srvreq_f
            where ( org.ROW_WID = srvreq_f.ACCNT_WID
                    and srvreq_d.ROW_WID = srvreq_f.SR_WID
                    and srvreq_d.integration_id = srvreq_f.integration_id
                    and srvreq_f.OPEN_DT_WID = day_d.ROW_WID
                    and srvreq_f.X_SR_ATTR_WID = srvreq_md.ROW_WID
                    and day_d.ROW_WID >= 20110101.0
                    and day_d.ROW_WID < 20150101.0 )
        )
        select BBL, BIN, agency, type_cd_i, area, sub_area, count(sr_num) as count, year_dt
        from complaints
        group by BBL, BIN, agency, type_cd_i,area,sub_area, year_dt
    '''
    complaints = moda.databridge(query)
    # Single-argument print emits the same text under Python 2 and Python 3.
    print('311 initial pull %s' % (complaints.shape,))

    # Convert BBL to a number and drop non-positive BBLs. 'coerce' is the
    # documented option that turns unparseable values into NaN; NaN > 0 is
    # False, so those rows are dropped as well.
    complaints.BBL = pd.to_numeric(complaints.BBL, errors='coerce')
    complaints = complaints[complaints.BBL > 0]
    print('311 after dropping non positive BBLs %s' % (complaints.shape,))

    complaints.BIN = pd.to_numeric(complaints.BIN, errors='coerce')

    # Replace every remaining missing value with a '-' placeholder.
    complaints.fillna('-', inplace=True)
    return complaints
def readLinked(): ''' read in linked(matched) DMV/SPARKS data''' linked = moda.databridge('select * from anaphi.dohmh_traffic_dot_moda2', encrypted='yes') # some col names end with underscore, removing them. linked.columns = [x[:-1] if x[-1]=='_' else x for x in linked.columns ] print 'linked',linked.shape # dropping anything without a police report linked.drop(linked[linked.POL_REPT=='N'].index,inplace=True) # dropping anything with injury type = 15-18 since that corresponds to the civilian form linked.drop(index = linked[linked.INJT_ID.isin(['15','16','17','18'])].index, inplace=True) linked.reset_index(drop=True,inplace=True) print 'linked after dropping no police reports', linked.shape return linked
def gatherDOBcomp():
    '''Return DOB illegal-work complaints entered from 2011 through 2014.

    Pulls all DOB complaints in the date range, derives ``DOBYear`` and a
    numeric ``BBL`` (BORO + BLOCK + LOT), keeps the complaint categories
    listed as illegal work, and drops complaints whose last disposition code
    states that no violation was found.

    Returns:
        The filtered illegal-work complaints.
    '''
    query = '''
        select * from anafic.wc_dob_complaints_f
        where DATE_ENTERED >= to_date('2011-01-01','YYYY-MM-DD')
        and DATE_ENTERED < to_date('2015-01-01','YYYY-MM-DD')
    '''
    dob = moda.databridge(query)
    # Single-argument print emits the same text under Python 2 and Python 3.
    print('dob complaints %s' % (dob.shape,))

    # Extract the year of the complaint.
    dob['DOBYear'] = dob['DATE_ENTERED'].apply(lambda x: x.year)

    # Change the inspection date to a datetime; unparseable values become NaT.
    dob['INSPECTION_DATE'] = pd.to_datetime(dob['INSPECTION_DATE'], errors='coerce')

    # Concatenate borough, block and lot to get the BBL.
    dob['BBL'] = dob.BORO + dob.BLOCK + dob.LOT

    # Convert BIN and BBL to numbers. 'coerce' is the documented option that
    # turns unparseable values into NaN instead of raising.
    dob['BBL'] = pd.to_numeric(dob.BBL, errors='coerce')
    dob['BIN'] = pd.to_numeric(dob.BIN, errors='coerce')

    # Complaint categories treated as illegal work.
    dobIllWorkCat = ['05', '83', '86', '66', '71', '90', '12', '5G', '76', '5A', '3A']
    dobIllWork = dob[dob['COMPLAINT_CATEGORY'].isin(dobIllWorkCat)]

    # Drop complaints where the last disposition code states no violation
    # was found.
    dropcodes = ['I2', 'XX', 'I1', 'H1']
    dobIWC = dobIllWork[~dobIllWork.LAST_DISPOSITION_CODE.isin(dropcodes)]

    print('dob illegal work %s' % (dobIWC.shape,))
    return dobIWC