Example 1
from functools import partial

from pyspark import SparkContext

# utils and the header lists (eff_care_headers, readmission_headers,
# proc_headers) are assumed to be defined elsewhere in this module.


def main():
    sc = SparkContext(appName="Transforming Eff Care")
    src_effcare = sc.textFile(utils.data_home + "/effective_care.csv")
    src_readmission = sc.textFile(utils.data_home + "/readmissions.csv")

    # Bind each source schema to the shared target schema so the same
    # transform_row function can be mapped over both datasets.
    transform1 = partial(transform_row,
                         orig_headers=eff_care_headers,
                         new_headers=utils.map_headers(proc_headers, eff_care_headers))
    transform2 = partial(transform_row,
                         orig_headers=readmission_headers,
                         new_headers=utils.map_headers(proc_headers, readmission_headers))

    # Parse each line into fields, reshape it to the target schema, and
    # serialize it back into a delimited string.
    transformed_eff = src_effcare.map(utils.to_row_sep).map(transform1).map(utils.to_row_string)
    transformed_readmission = src_readmission.map(utils.to_row_sep).map(transform2).map(utils.to_row_string)

    # "+" on RDDs is a union, so both datasets land in one output directory.
    (transformed_eff + transformed_readmission) \
        .saveAsTextFile(utils.data_home + "/procedures_data")

    sc.stop()
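
The utils helpers are not shown in this listing. The following is a minimal sketch of what they might look like, inferred purely from how they are called above; the delimiter, the NULL_FMT value, the data_home path, and the map_headers semantics are all assumptions, not the original implementation.

# Hypothetical sketch of the utils module -- names match the calls in the
# examples, but every body here is an assumption, not the original code.
from collections import OrderedDict

data_home = "/path/to/data"  # assumed base directory for the CSV files
NULL_FMT = ""                # assumed placeholder for missing columns


def to_row_sep(line, sep=","):
    # Split one raw CSV line into its fields (assumes no quoted separators).
    return line.split(sep)


def to_row_string(parts, sep=","):
    # Join the transformed fields back into a single delimited line.
    return sep.join(parts)


def map_headers(new_headers, orig_headers):
    # Map each target header to its column index in the source schema,
    # using -1 when the source file has no such column. Iterating a
    # previously built mapping yields its keys, so passing the result of
    # map_headers back in (as Example 2 does) still works.
    return OrderedDict(
        (h, orig_headers.index(h) if h in orig_headers else -1)
        for h in new_headers
    )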
Example 2
def transform_row(line_parts, orig_headers, new_headers, transform_fns=None):
    """Reshape one row from the source schema (orig_headers) to the target
    schema (new_headers), applying an optional per-column transform."""
    # Map each target header to its column index in the source row,
    # with -1 marking columns the source file does not have.
    headers = utils.map_headers(new_headers, orig_headers)
    if transform_fns is None:
        # Default to identity transforms when no cleanups are supplied.
        transform_fns = [lambda value: value] * len(headers)

    new_parts = []
    for (key, index), transform in zip(headers.items(), transform_fns):
        if index > -1:
            value = transform(line_parts[index])
        else:
            # The source file lacks this column; emit the null placeholder.
            value = utils.NULL_FMT

        new_parts.append(value)

    return new_parts
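
As a quick sanity check, here is how transform_row behaves on a toy row, assuming the utils sketch above is importable as utils. The header names, field values, and str.strip cleanups are invented for illustration and do not come from the original dataset.

# Toy illustration -- headers, values, and transforms are made up.
from functools import partial

orig_headers = ["provider_id", "score", "measure"]
proc_headers = ["provider_id", "measure", "footnote"]  # "footnote" is absent

transform = partial(transform_row,
                    orig_headers=orig_headers,
                    new_headers=proc_headers,
                    transform_fns=[str.strip] * len(proc_headers))

print(transform(["10001 ", "98", " OP_22"]))
# ['10001', 'OP_22', ''] -- fields reordered to the target schema, with
# NULL_FMT filling the column the source row does not have.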