Ejemplo n.º 1
0
def main():
    """
    Indicators for DOC sentences by crime type
    """

    sql = """
          SELECT DISTINCT offense_type AS offense
            FROM {doc_sentences}
           WHERE offense_type IS NOT NULL
        ORDER BY offense
          """.format(**globals())

    with cx_Oracle.connect("/") as cxn:
        offenses = pd.read_sql(sql, cxn)

    print(offenses)
    pivot = ",".join([
        "'{0}' AS doc_sentenced_{0}".format(offense)
        for offense in offenses["OFFENSE"]
    ])

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT * 
            FROM (
                  SELECT pop.riipl_id,
                         ds.offense_type AS offense,
                         1 AS sentenced
                    FROM {population} pop
               LEFT JOIN {lookback} lb
                      ON pop.riipl_id = lb.riipl_id
               LEFT JOIN {dim_date} dd
                      ON lb.yrmo = dd.yrmo
              INNER JOIN {doc_sentences} ds
                      ON pop.riipl_id = ds.riipl_id AND 
                         dd.date_dt = ds.imposed_date
                GROUP BY pop.riipl_id, ds.offense_type
                 ) 
           PIVOT (MAX(sentenced) FOR offense IN ({pivot}))
          """.format(pivot=pivot, **globals())

    with cx_Oracle.connect("/") as cxn:
        features = features.join(pd.read_sql(sql, cxn).set_index(index))

    # Drop offense types that are missing for the entire population.
    features = features.dropna(axis=1, how="all").fillna(0).astype(int)

    labels = {}
    for offense in offenses["OFFENSE"]:
        label = "sentenced for {} during lookback period".format(offense)
        labels["DOC_SENTENCED_{}".format(offense.upper())] = label

    SaveFeatures(features,
                 out,
                 manifest,
                 population,
                 labels,
                 bool_features=list(labels))
Ejemplo n.º 2
0
def main():
    """
    Industry worked in (NAICS code) from DLT wages
    """

    sql = """
          SELECT DISTINCT SUBSTR(naics4, 0, 2) AS naics2
            FROM {dlt_wage}
           ORDER BY naics2
          """.format(**globals())

    with cx_Oracle.connect("/") as cxn:
        naics = pd.read_sql(sql, cxn)

    pivot = ",".join(["'{0}' AS naics_{0}".format(n) for n in naics["NAICS2"]])

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT *
            FROM (   
                  SELECT pop.riipl_id,
                         SUBSTR(dw.naics4, 0, 2) AS naics2,
                         1 AS worked
                    FROM {population} pop
               LEFT JOIN (
                          SELECT DISTINCT
                                 riipl_id,
                                 yyq
                            FROM {lookback}
                         ) lb
                      ON pop.riipl_id = lb.riipl_id
              INNER JOIN {dlt_wage} dw
                      ON pop.riipl_id = dw.riipl_id AND
                         lb.yyq = dw.yyq
                GROUP BY pop.riipl_id, SUBSTR(dw.naics4, 0, 2)
                 ) 
           PIVOT (MAX(worked) FOR naics2 IN ({pivot}))
         """.format(pivot=pivot, **globals())

    with cx_Oracle.connect("/") as cxn:
        features = features.join(pd.read_sql(sql, cxn).set_index(index))

    # Drop NAICS categories that are missing for the entire population.
    features = features.dropna(axis=1, how="all").fillna(0).astype(int)

    # Missing indicator not needed - provided by WAGES_MISSING

    labels = {}
    naics2_labels = {
        "00": "Unknown",
        "02": "Unknown",
        "11": "Agriculture, Forestry, Fishing, and Hunting",
        "21": "Mining, Quarrying, and Oil and Gas Extraction",
        "22": "Utilities",
        "23": "Construction",
        "24": "Unknown",
        "31": "Manufacturing",
        "32": "Manufacturing",
        "33": "Manufacturing",
        "42": "Wholesale Trade",
        "44": "Retail Trade",
        "45": "Retail Trade",
        "48": "Transportation and Warehousing",
        "49": "Transportation and Warehousing",
        "50": "Unknown",
        "51": "Information",
        "52": "Finance and Insurance",
        "53": "Real Estate and Rental and Leasing",
        "54": "Professional, Scientific, and Technical Services",
        "55": "Management of Companies and Enterprises",
        "56":
        "Administrative and Support and Waste Management and Remediation Services",
        "61": "Educational Services",
        "62": "Health Care and Social Assistance",
        "71": "Arts, Entertainment, and Recreation",
        "72": "Accommodation and Food Services",
        "81": "Other Services (except Public Administration)",
        "84": "Unknown",
        "92": "Public Administration",
        "94": "Unknown",
        "99": "Unknown"
    }
    for n, sector in naics2_labels.items():
        label = "worked in {} sector (NAICS {}) during lookback period".format(
            sector, n)
        labels["NAICS_{}".format(n)] = label

    SaveFeatures(features,
                 out,
                 manifest, (population, index),
                 labels,
                 bool_features=list(labels))