Ejemplo n.º 1
0
def readDMV():
    '''read in DMV data tables'''
    crash = moda.databridge("select * from anaphi.crash", encrypted='yes')
    ind = moda.databridge("select * from anaphi.individual", encrypted='yes')
    veh = moda.databridge("select * from anaphi.vehicle", encrypted='yes')

    # header got repreated in a couple of the files, removing it
    crash.drop(133, inplace=True)
    ind.drop(484, inplace=True)

    # reformating date
    crash['year'] = crash.CSACCDTE.str[:4]
    crash['day'] = crash.CSACCDTE.str[8:10]
    crash['month'] = crash.CSACCDTE.str[5:7]
    crash['date'] = pd.to_datetime(crash[['year', 'month', 'day']])

    # reformating age and vehicle type to number
    veh.VEHBDYT_ID = pd.to_numeric(veh.VEHBDYT_ID, errors='coearse')
    ind.INDIV_AGE = pd.to_numeric(ind.INDIV_AGE, errors='coearse')

    print 'full crash table', crash.shape
    print 'full person table', ind.shape
    print 'full vehicle table', veh.shape

    return crash, ind, veh
Ejemplo n.º 2
0
def gatherConst():
    ''' query contsruction data, 2011-2014
    return alt 1-3 jobs on multiple floors '''
    query = '''
    SELECT J_BORO||J_BLOCK||J_LOT AS BBL, 
        J_BIN_NUMBER as BIN, 
        J_PRE_FILING_DATE, 
        J_FLOOR,
        J_JOB_TYPE_DESC
    FROM anafic.wc_dob_job_f
    WHERE J_PRE_FILING_DATE >= TO_DATE('2007-01-01' , 'YYYY-MM-DD HH24:MI:SS')
    AND J_PRE_FILING_DATE < TO_DATE('2016-01-01' , 'YYYY-MM-DD HH24:MI:SS')
    '''

    jobs = moda.databridge(query)
    jobs.drop_duplicates(inplace=True)

    # extracting out the year from the job filing date
    jobs['jobYear'] = jobs.J_PRE_FILING_DATE.apply(lambda x: x.year)

    # converting to ints
    jobs.BBL = pd.to_numeric(jobs.BBL,errors='coersive')
    jobs.BIN = pd.to_numeric(jobs.BIN,errors='coersive')
    jobs = jobs[jobs.BBL>1]
    jobs = jobs[jobs.BIN>1]

    # only keeping ALT 1-3
    altjobs = jobs[jobs['J_JOB_TYPE_DESC'].str.contains('Alteration')]

    # only keeping jobs with work on multiple floors (J_FLOOR contains a comma or dash)
    jobsMultFl = altjobs.fillna('0')[altjobs.fillna('0')['J_FLOOR'].str.contains('-|,')]
    print 'jobs',jobs.shape
    return jobsMultFl
Ejemplo n.º 3
0
def gatherSale():
    '''query DOF sale, change of ownership data, 2011-2014, no 1 or 2 family homes'''
    query = '''
    SELECT DOF_OWNR_NAME_UPDTD, BBL
    FROM anafic.wc_dof_ownership_f LEFT OUTER JOIN anafic.wc_location_d 
        ON wc_dof_ownership_f.location_wid = wc_location_d.row_wid
    WHERE dof_ownr_name_updtd <= sysdate --don't include dates from dec 31 9999
    AND dof_ownr_name_updtd >= TO_DATE('2011-01-01' , 'YYYY-MM-DD HH24:MI:SS')
    AND dof_ownr_name_updtd < TO_DATE('2015-01-01' , 'YYYY-MM-DD HH24:MI:SS')
    AND BBL > 0 --to remove null BBLs
    AND bldg_class NOT LIKE 'A%' --to remove single family homes
    AND bldg_class NOT LIKE 'B%' --to remove 2 family homes'''

    own = moda.databridge(query)

    # convert to ints
    own.BBL = pd.to_numeric(own.BBL,errors='coersive')

    # extracting out the year from the sales date
    own['saleYear'] = pd.to_datetime(own.DOF_OWNR_NAME_UPDTD).apply(lambda x: x.year)

    own.drop_duplicates(inplace=True)

    print 'DOF',own.shape
    return own
Ejemplo n.º 4
0
def gather311():
    ''' returns 311 complaints tied to a BBL from 2011 to 2014'''
    query = '''
    with complaints as 
    (select 
     upper(org.LVL8ANC_NAME) as agency,
     srvreq_d.X_BBL as BBL,
     srvreq_d.X_BIN as BIN,
     srvreq_md.TYPE_CD_I,
     srvreq_md.AREA,
     srvreq_md.SUB_AREA,
     srvreq_d.SR_NUM,
     day_d.DAY_DT,
     extract(year from day_dt) as YEAR_DT

    from 
     (SELECT * FROM anaprd.W_ORG_DH WHERE TOP_LVL_NAME <> 'DSNY' 
        and (X_AGENCY_ACRONYM IS NOT NULL OR TOP_LVL_ID <> LVL8ANC_ID OR FIXED_HIER_LEVEL <> 9)) org,
     anaprd.W_SRVREQ_MD srvreq_md,
     anaprd.W_DAY_D day_d,
     anaprd.W_SRVREQ_D srvreq_d, 
     anaprd.W_SRVREQ_F srvreq_f
     
    where  (
         org.ROW_WID = srvreq_f.ACCNT_WID and 
         srvreq_d.ROW_WID = srvreq_f.SR_WID and
         srvreq_d.integration_id = srvreq_f.integration_id and
         srvreq_f.OPEN_DT_WID = day_d.ROW_WID and 
         srvreq_f.X_SR_ATTR_WID = srvreq_md.ROW_WID and 
         day_d.ROW_WID >= 20110101.0 and
         day_d.ROW_WID < 20150101.0
        ) 
    )
    select BBL,
        BIN,
        agency, 
        type_cd_i, 
        area, 
        sub_area, 
        count(sr_num) as count, 
        year_dt 
    from complaints
    group by BBL, BIN, agency, type_cd_i,area,sub_area, year_dt
    '''

    complaints = moda.databridge(query)
    print '311 initial pull',complaints.shape

    #convert to float and drop non positive BBLs
    complaints.BBL = pd.to_numeric(complaints.BBL,errors='coersive')
    complaints = complaints[complaints.BBL>0]
    print '311 after dropping non positive BBLs', complaints.shape

    complaints.BIN = pd.to_numeric(complaints.BIN,errors='coersive')
    complaints.fillna('-',inplace=True)

    return complaints
Ejemplo n.º 5
0
def readLinked():
    ''' read in linked(matched) DMV/SPARKS data'''

    linked = moda.databridge('select * from anaphi.dohmh_traffic_dot_moda2', encrypted='yes')

    # some col names end with underscore, removing them.
    linked.columns = [x[:-1] if x[-1]=='_' else x for x in linked.columns ]

    print 'linked',linked.shape

    # dropping anything without a police report
    linked.drop(linked[linked.POL_REPT=='N'].index,inplace=True)
    # dropping anything with injury type = 15-18 since that corresponds to the civilian form
    linked.drop(index = linked[linked.INJT_ID.isin(['15','16','17','18'])].index,
            inplace=True)

    linked.reset_index(drop=True,inplace=True)

    print 'linked after dropping no police reports', linked.shape
    return linked
Ejemplo n.º 6
0
def gatherDOBcomp():
    ''' DOB complaints 2011 to 2014 '''
    query = '''
    select *
    from anafic.wc_dob_complaints_f
    where DATE_ENTERED >= to_date('2011-01-01','YYYY-MM-DD')
    and DATE_ENTERED < to_date('2015-01-01','YYYY-MM-DD')
    '''

    dob = moda.databridge(query)
    print 'dob complaints',dob.shape

    # extract the year of the complaint
    dob['DOBYear'] = dob['DATE_ENTERED'].apply(lambda x: x.year) 
    
    # change to date format
    dob['INSPECTION_DATE'] = pd.to_datetime(dob['INSPECTION_DATE'],errors='coerce')

    #combine to get BBL
    dob['BBL']=dob.BORO+dob.BLOCK+dob.LOT

    #convert BIN,BBL to numbers
    dob['BBL'] = pd.to_numeric(dob.BBL,errors='coersive')
    dob['BIN'] = pd.to_numeric(dob.BIN,errors='coersive')

    # illegal work complaints
    dobIllWorkCat = ['05','83','86','66','71','90','12','5G','76','5A','3A']
    dobIllWork = dob[dob['COMPLAINT_CATEGORY'].isin(dobIllWorkCat)]
    dobIllWork.groupby(['COMPLAINT_CATEGORY','COMPLAINT_CATEGORY_DESCRIPTION']).count()[['BBL']]

    # drop complaints where the last disposition code states no violation was found
    dropcodes = ['I2','XX','I1','H1']
    dobIWC = dobIllWork[~dobIllWork.LAST_DISPOSITION_CODE.isin(dropcodes)]
    
    print 'dob illegal work',dobIWC.shape
    return dobIWC