Example #1
from pyspark.sql import SparkSession, SQLContext
import src.lib.utils as Utils
import pyspark.sql.functions as F

if __name__ == "__main__":

    spark = SparkSession.builder.appName("data2insights").getOrCreate()

    sqlContext = SQLContext(spark)

    zipcode_s3_input_file_location = "s3a://data2insights/Medicare/parquet/medicare_clean_zipcode"
    zipcode_s3_output_file_location = "s3a://data2insights/zipcode/parquet/medicare_count_by_zipcode"

    medicare_df = sqlContext.read.parquet(zipcode_s3_input_file_location)

    # Count Medicare rows per 5-digit zip code
    zipcode_counts_df = medicare_df.groupBy("zip5").agg(
        F.count(F.lit(1)).alias("zipcode_medicare_count"))

    Utils.write_df_to_s3(zipcode_counts_df, zipcode_s3_output_file_location)

    spark.stop()
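
All of these examples lean on a small src.lib.utils helper module that is not shown. From the call sites, write_df_to_s3 takes a DataFrame and an S3 location; a minimal sketch, assuming a plain overwrite-mode parquet write (the write mode is an assumption, the real helper may append or partition instead):

from pyspark.sql import DataFrame


def write_df_to_s3(df: DataFrame, s3_location: str) -> None:
    """Persist a DataFrame to the given S3 location as parquet.

    Overwrite mode is assumed here; adjust to match the real helper.
    """
    df.write.mode("overwrite").parquet(s3_location)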
Example #2
from pyspark.sql import SparkSession, SQLContext
import src.lib.utils as Utils
import pyspark.sql.functions as F

if __name__ == "__main__":

    spark = SparkSession.builder.appName("data2insights").getOrCreate()

    sqlContext = SQLContext(spark)

    hcp_s3_input_file_location = "s3a://data2insights/Medicare/parquet/medicare_hcp"
    hco_s3_input_file_location = "s3a://data2insights/Medicare/parquet/medicare_hco"

    hcp_s3_output_file_location = "s3a://data2insights/provider/parquet/medicare_count_by_hcp"
    hco_s3_output_file_location = "s3a://data2insights/institution/parquet/medicare_count_by_hco"

    medicare_hcp_df = sqlContext.read.parquet(hcp_s3_input_file_location)
    medicare_hco_df = sqlContext.read.parquet(hco_s3_input_file_location)

    # Count Medicare rows per provider (HCP) and per institution (HCO) NPI
    hcp_counts_df = medicare_hcp_df.groupBy("npi").agg(
        F.count(F.lit(1)).alias("hcp_medicare_count"))

    hco_counts_df = medicare_hco_df.groupBy("npi").agg(
        F.count(F.lit(1)).alias("hco_medicare_count"))

    Utils.write_df_to_s3(hcp_counts_df, hcp_s3_output_file_location)
    Utils.write_df_to_s3(hco_counts_df, hco_s3_output_file_location)

    spark.stop()
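
As an aside, F.count(F.lit(1)) counts every row in the group. The built-in count() aggregation is equivalent, while counting a real column would silently skip nulls:

    # Equivalent per-NPI row count; the result column is named "count"
    hcp_counts_df = medicare_hcp_df.groupBy("npi").count()

    # By contrast, F.count("zip5") would count only rows with a non-null
    # zip5, which is why a constant literal is counted above.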
Example #3
from pyspark.sql import SparkSession, SQLContext
import src.lib.utils as Utils
import pyspark.sql.functions as F

if __name__ == "__main__":

    spark = SparkSession.builder.appName("data2insights").getOrCreate()
    sqlContext = SQLContext(spark)

    npi_hco_s3_input_file_location = "s3a://data2insights/NPPES/parquet/npi_hco"
    medicare_hco_s3_input_file_location = "s3a://data2insights/institution/parquet/medicare_count_by_hco"

    s3_output_file_location = "s3a://data2insights/institution/parquet/combined_institution_master"

    limit = 10000000
    cols = [
        F.col("npi").alias("npi"),
        F.col("provider_organization_name_legal_business_name").alias("name"),
        F.col("hco_medicare_count").alias("medicare_count"),
        F.col("zip5").alias("zip_code")
    ]

    npi_hco_df = sqlContext.read.parquet(npi_hco_s3_input_file_location)
    medicare_hco_df = sqlContext.read.parquet(
        medicare_hco_s3_input_file_location)

    # Left join keeps every NPPES institution, even those with no Medicare rows
    institution_df = npi_hco_df.join(medicare_hco_df, ["npi"],
                                     how="left").select(cols).limit(limit)

    Utils.write_df_to_s3(institution_df, s3_output_file_location)

    spark.stop()
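
One consequence of the left join: institutions with no Medicare rows come through with a null medicare_count. If downstream consumers expect zeros instead, a fill step (hypothetical, not in the original) could run before the write:

    institution_df = institution_df.na.fill({"medicare_count": 0})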
Example #4
from pyspark.sql import SparkSession, SQLContext
import src.lib.utils as Utils
import pyspark.sql.functions as F

if __name__ == "__main__":

    spark = SparkSession.builder.appName("data2insights").getOrCreate()
    sqlContext = SQLContext(spark)

    npi_hcp_s3_input_file_location = "s3a://data2insights/NPPES/parquet/npi_hcp"
    medicare_hcp_s3_input_file_location = "s3a://data2insights/provider/parquet/medicare_count_by_hcp"

    s3_output_file_location = "s3a://data2insights/provider/parquet/combined_provider_master"

    limit = 10000000
    cols = [
        F.col("npi").alias("npi"),
        F.col("provider_last_name_legal_name").alias("last_name"),
        F.col("provider_first_name").alias("first_name"),
        F.col("provider_middle_name").alias("middle_name"),
        F.col("provider_name_suffix_text").alias("suffix"),
        F.col("provider_credential_text").alias("credentials"),
        F.col("provider_gender_code").alias("gender"),
        F.col("healthcare_provider_taxonomy_code_1").alias("specialty"),
        F.col("hcp_medicare_count").alias("medicare_count"),
        F.col("zip5").alias("zip_code")
    ]

    npi_hcp_df = sqlContext.read.parquet(npi_hcp_s3_input_file_location)
    medicare_hcp_df = sqlContext.read.parquet(medicare_hcp_s3_input_file_location)

    provider_df = npi_hcp_df.join(medicare_hcp_df, ["npi"], how="left").select(cols).limit(limit)

    Utils.write_df_to_s3(provider_df, s3_output_file_location)

    spark.stop()
Example #5
from pyspark.sql import SparkSession, SQLContext
import src.lib.utils as Utils

if __name__ == "__main__":

    spark = SparkSession.builder.appName("data2insights").getOrCreate()

    sqlContext = SQLContext(spark)

    s3_input_file_location = "s3a://data2insights/NPPES/parquet/npi"
    s3_output_file_location = "s3a://data2insights/NPPES/parquet/npi_subset_10000"

    medicare_df = sqlContext.read.parquet(s3_input_file_location)
    medicare_subset_df = medicare_df.limit(10000)
    Utils.write_df_to_s3(medicare_subset_df, s3_output_file_location)

    spark.stop()
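
Note that limit(10000) on an unordered DataFrame makes no promise about which rows are kept from run to run. If a reproducible subset mattered, a seeded sample would be the usual alternative (a hypothetical variant, not in the original; the fraction is illustrative):

    medicare_subset_df = medicare_df.sample(fraction=0.01, seed=42)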
Example #6
from pyspark.sql import SparkSession, SQLContext
import src.lib.utils as Utils


if __name__ == "__main__":

    spark = SparkSession.builder.appName("data2insights").getOrCreate()

    sqlContext = SQLContext(spark)

    s3_input_file_location = "s3a://data2insights/Medicare/parquet/medicare_clean_zipcode"

    s3_hcp_output_file_location = "s3a://data2insights/Medicare/parquet/medicare_hcp"
    s3_hco_output_file_location = "s3a://data2insights/Medicare/parquet/medicare_hco"

    medicare_df = sqlContext.read.parquet(s3_input_file_location)

    # nppes_entity_code: I = individual provider (HCP), O = organization (HCO)
    medicare_hcp_df = medicare_df.where(medicare_df.nppes_entity_code == 'I').coalesce(8)
    medicare_hco_df = medicare_df.where(medicare_df.nppes_entity_code == 'O').coalesce(8)

    Utils.write_df_to_s3(medicare_hcp_df, s3_hcp_output_file_location)
    Utils.write_df_to_s3(medicare_hco_df, s3_hco_output_file_location)

    spark.stop()
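
The coalesce(8) calls above only merge existing partitions, which caps the number of output files without a shuffle. If the filtered subsets were badly skewed, an explicit repartition (which does shuffle) would rebalance them; a hypothetical variant:

    medicare_hcp_df = medicare_df.where(
        medicare_df.nppes_entity_code == 'I').repartition(8)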
Example #7
from pyspark.sql import SparkSession, SQLContext
import src.lib.utils as Utils

if __name__ == "__main__":

    spark = SparkSession.builder.appName("data2insights").getOrCreate()

    sqlContext = SQLContext(spark)

    s3_input_file_location = "s3a://data2insights/NPPES/parquet/npi_clean_zipcode"

    s3_hcp_output_file_location = "s3a://data2insights/NPPES/parquet/npi_hcp"
    s3_hco_output_file_location = "s3a://data2insights/NPPES/parquet/npi_hco"

    npi_subset_df = sqlContext.read.parquet(s3_input_file_location)

    # NPPES entity_type_code: 1 = individual (HCP), 2 = organization (HCO)
    npi_hcp_subset_df = npi_subset_df.where(
        npi_subset_df.entity_type_code == 1).coalesce(8)
    npi_hco_subset_df = npi_subset_df.where(
        npi_subset_df.entity_type_code == 2).coalesce(8)

    Utils.write_df_to_s3(npi_hcp_subset_df, s3_hcp_output_file_location)
    Utils.write_df_to_s3(npi_hco_subset_df, s3_hco_output_file_location)

    spark.stop()
Example #8
from pyspark.sql import SparkSession, SQLContext
import src.lib.utils as Utils
import pyspark.sql.functions as F

if __name__ == "__main__":

    spark = SparkSession.builder.appName("data2insights").getOrCreate()

    sqlContext = SQLContext(spark)

    # NOTE: this example was truncated at the top. The locations below are
    # assumptions reconstructed from the other examples' naming conventions,
    # except medicare_count_by_zipcode, which Example #1 writes.
    s3_uszips_file_location = "s3a://data2insights/uszips/parquet/uszips"  # assumed
    npi_counts_s3_input_file_location = (
        "s3a://data2insights/zipcode/parquet/npi_count_by_zipcode")  # assumed
    medicare_counts_s3_input_file_location = (
        "s3a://data2insights/zipcode/parquet/medicare_count_by_zipcode")
    s3_output_file_location = (
        "s3a://data2insights/zipcode/parquet/combined_neighborhood_master")  # assumed

    df_npi_counts = sqlContext.read.parquet(npi_counts_s3_input_file_location)
    df_medicare_counts = sqlContext.read.parquet(
        medicare_counts_s3_input_file_location)

    # uszips reference data keys on "zip"; rename it to match the join key
    df_uszips = sqlContext.read.parquet(
        s3_uszips_file_location).withColumnRenamed("zip", "zip5")

    df_hcp = df_npi_counts.where(df_npi_counts.entity_type_code == 1).select(
        F.col("zip5"),
        F.col("count").alias("hcp_count"))
    df_hco = df_npi_counts.where(df_npi_counts.entity_type_code == 2).select(
        F.col("zip5"),
        F.col("count").alias("hco_count"))

    df_zip_counts = df_hcp.join(df_hco, ["zip5"], how="full")

    cols = [
        F.col("zip5").alias("zip_code"),
        F.col("state_id").alias("state"),
        F.col("lat").alias("latitude"),
        F.col("lng").alias("longitude"),
        F.col("county_fips").alias("county_fips"),
        F.col("county_name").alias("county_name"),
        F.col("hcp_count").alias("provider_count"),
        F.col("hco_count").alias("institution_count"),
        F.col("zipcode_medicare_count").alias("medicare_count")
    ]

    # Full outer joins keep any zip code present in at least one source
    df_neighborhood = df_zip_counts.join(df_uszips, ["zip5"], how="full").join(
        df_medicare_counts, ["zip5"], how="full").select(cols)

    Utils.write_df_to_s3(df_neighborhood, s3_output_file_location)

    spark.stop()
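
As with the left joins earlier, these full outer joins leave nulls wherever a zip code is missing from one of the sources. Applied before the write, a fill step (hypothetical, not in the original) would turn the missing counts into zeros:

    df_neighborhood = df_neighborhood.na.fill(
        0, ["provider_count", "institution_count", "medicare_count"])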
Example #9
from pyspark.sql import SparkSession, SQLContext
import src.lib.utils as Utils

if __name__ == "__main__":

    spark = SparkSession.builder.appName("data2insights").getOrCreate()

    sqlContext = SQLContext(spark)

    s3_input_file_location = "s3a://data2insights/NPPES/parquet/npi_subset_10000"
    s3_output_file_location = (
        "s3a://data2insights/NPPES/parquet/npi_subset_clean_zipcode")

    npi_subset_df = sqlContext.read.parquet(s3_input_file_location)
    npi_subset_clean_zipcode_df = Utils.add_zip5_col(
        npi_subset_df,
        "provider_business_practice_location_address_postal_code")
    Utils.write_df_to_s3(npi_subset_clean_zipcode_df, s3_output_file_location)

    spark.stop()
Example #10
from pyspark.sql import SparkSession, SQLContext
import src.lib.utils as Utils

if __name__ == "__main__":

    spark = SparkSession.builder.appName("data2insights").getOrCreate()

    sqlContext = SQLContext(spark)

    s3_input_file_location = "s3a://data2insights/Census/parquet/2010census"
    s3_output_file_location = (
        "s3a://data2insights/Census/parquet/2010census_clean_zipcode")

    census_2010_df = sqlContext.read.parquet(s3_input_file_location)
    census_2010_clean_zipcode_df = Utils.add_zip5_col(census_2010_df)
    Utils.write_df_to_s3(census_2010_clean_zipcode_df, s3_output_file_location)

    spark.stop()
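
Utils.add_zip5_col is the other unshown helper. Example #9 passes an explicit postal-code column while Example #10 relies on a default, so it plausibly takes the DataFrame plus a column name with a default value; a minimal sketch (the default column name and the substring logic are assumptions):

import pyspark.sql.functions as F
from pyspark.sql import DataFrame


def add_zip5_col(df: DataFrame, zipcode_col: str = "zipcode") -> DataFrame:
    """Add a zip5 column holding the first five characters of a postal code.

    Raw postal codes are often 9-digit ZIP+4 values; truncating to five
    digits normalizes them for the zip-level joins in the other examples.
    """
    return df.withColumn(
        "zip5", F.substring(F.col(zipcode_col).cast("string"), 1, 5))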