from pyspark.sql import SparkSession, SQLContext
import src.lib.utils as Utils
import pyspark.sql.functions as F

if __name__ == "__main__":
    spark = SparkSession.builder.appName("data2insights").getOrCreate()
    sqlContext = SQLContext(spark)

    zipcode_s3_input_file_location = "s3a://data2insights/Medicare/parquet/medicare_clean_zipcode"
    zipcode_s3_output_file_location = "s3a://data2insights/zipcode/parquet/medicare_count_by_zipcode"

    medicare_df = sqlContext.read.parquet(zipcode_s3_input_file_location)

    zipcode_counts_df = medicare_df.groupBy("zip5").agg(
        F.count(F.lit(1)).alias("zipcode_medicare_count"))

    Utils.write_df_to_s3(zipcode_counts_df, zipcode_s3_output_file_location)

    spark.stop()
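These scripts all call into src.lib.utils, which is not shown in this section. A minimal sketch of what its write_df_to_s3 helper might look like, assuming it simply writes the DataFrame out as Parquet (the overwrite save mode is also an assumption):

# src/lib/utils.py (sketch; the real helper is not shown in this section)
def write_df_to_s3(df, s3_location, mode="overwrite"):
    """Write a DataFrame to the given s3a:// location as Parquet.

    Assumption: the actual helper may use a different save mode or partitioning.
    """
    df.write.mode(mode).parquet(s3_location)

Each script is a standalone Spark application, so it would typically be launched with spark-submit and an S3-capable Hadoop build, e.g. spark-submit --packages org.apache.hadoop:hadoop-aws:2.7.7 count_medicare_by_zipcode.py; the package version and script filename here are illustrative, not taken from the repository.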
from pyspark.sql import SparkSession, SQLContext import src.lib.utils as Utils import pyspark.sql.functions as F if __name__ == "__main__": spark = SparkSession.builder.appName("data2insights").getOrCreate() sqlContext = SQLContext(spark) hcp_s3_input_file_location = "s3a://data2insights/Medicare/parquet/medicare_hcp" hco_s3_input_file_location = "s3a://data2insights/Medicare/parquet/medicare_hco" hcp_s3_output_file_location = "s3a://data2insights/provider/parquet/medicare_count_by_hcp" hco_s3_output_file_location = "s3a://data2insights/institution/parquet/medicare_count_by_hco" medicare_hcp_df = sqlContext.read.parquet(hcp_s3_input_file_location) medicare_hco_df = sqlContext.read.parquet(hco_s3_input_file_location) hcp_counts_df = (medicare_hcp_df.groupBy("npi").agg( F.count(F.lit(1)).alias("hcp_medicare_count"))) hco_counts_df = (medicare_hco_df.groupBy("npi").agg( F.count(F.lit(1)).alias("hco_medicare_count"))) Utils.write_df_to_s3(hcp_counts_df, hcp_s3_output_file_location) Utils.write_df_to_s3(hco_counts_df, hco_s3_output_file_location) spark.stop()
import pyspark.sql.functions as F if __name__ == "__main__": spark = SparkSession.builder.appName("data2insights").getOrCreate() sqlContext = SQLContext(spark) npi_hco_s3_input_file_location = "s3a://data2insights/NPPES/parquet/npi_hco" medicare_hco_s3_input_file_location = "s3a://data2insights/institution/parquet/medicare_count_by_hco" s3_output_file_location = "s3a://data2insights/institution/parquet/combined_institution_master" limit = 10000000 cols = [ F.col("npi").alias("npi"), F.col("provider_organization_name_legal_business_name").alias("name"), F.col("hco_medicare_count").alias("medicare_count"), F.col("zip5").alias("zip_code") ] npi_hco_df = sqlContext.read.parquet(npi_hco_s3_input_file_location) medicare_hco_df = sqlContext.read.parquet( medicare_hco_s3_input_file_location) institution_df = npi_hco_df.join(medicare_hco_df, ["npi"], how="left").select(cols).limit(limit) Utils.write_df_to_s3(institution_df, s3_output_file_location) spark.stop()
spark = SparkSession.builder.appName("data2insights").getOrCreate() sqlContext = SQLContext(spark) npi_hcp_s3_input_file_location = "s3a://data2insights/NPPES/parquet/npi_hcp" medicare_hcp_s3_input_file_location = "s3a://data2insights/provider/parquet/medicare_count_by_hcp" s3_output_file_location = "s3a://data2insights/provider/parquet/combined_provider_master" limit = 10000000 cols = [ F.col("npi").alias("npi"), F.col("provider_last_name_legal_name").alias("last_name"), F.col("provider_first_name").alias("first_name"), F.col("provider_middle_name").alias("middle_name"), F.col("provider_name_suffix_text").alias("suffix"), F.col("provider_credential_text").alias("credentials"), F.col("provider_gender_code").alias("gender"), F.col("healthcare_provider_taxonomy_code_1").alias("specialty"), F.col("hcp_medicare_count").alias("medicare_count"), F.col("zip5").alias("zip_code") ] npi_hcp_df = sqlContext.read.parquet(npi_hcp_s3_input_file_location) medicare_hcp_df = sqlContext.read.parquet(medicare_hcp_s3_input_file_location) provider_df = npi_hcp_df.join(medicare_hcp_df, ["npi"], how="left").select(cols).limit(limit) Utils.write_df_to_s3(provider_df, s3_output_file_location) spark.stop()
from pyspark.sql import SparkSession, SQLContext import src.lib.utils as Utils if __name__ == "__main__": spark = SparkSession.builder.appName("data2insights").getOrCreate() sqlContext = SQLContext(spark) s3_input_file_location = "s3a://data2insights/NPPES/parquet/npi" s3_output_file_location = "s3a://data2insights/NPPES/parquet/npi_subset_10000" medicare_df = sqlContext.read.parquet(s3_input_file_location) medicare_subset_df = medicare_df.limit(10000) Utils.write_df_to_s3(medicare_subset_df, s3_output_file_location) spark.stop()
from pyspark.sql import SparkSession, SQLContext import src.lib.utils as Utils from pyspark.sql.types import * if __name__ == "__main__": spark = SparkSession.builder.appName("data2insights").getOrCreate() sqlContext = SQLContext(spark) s3_input_file_location = "s3a://data2insights/Medicare/parquet/medicare_clean_zipcode" s3_hcp_output_file_location = "s3a://data2insights/Medicare/parquet/medicare_hcp" s3_hco_output_file_location = "s3a://data2insights/Medicare/parquet/medicare_hco" medicare_df = sqlContext.read.parquet(s3_input_file_location) medicare_hcp_df = medicare_df.where(medicare_df.nppes_entity_code == 'I').coalesce(8) medicare_hco_df = medicare_df.where(medicare_df.nppes_entity_code == 'O').coalesce(8) Utils.write_df_to_s3(medicare_hcp_df, s3_hcp_output_file_location) Utils.write_df_to_s3(medicare_hco_df, s3_hco_output_file_location) spark.stop()
from pyspark.sql import SparkSession, SQLContext import src.lib.utils as Utils from pyspark.sql.types import * if __name__ == "__main__": spark = SparkSession.builder.appName("data2insights").getOrCreate() sqlContext = SQLContext(spark) s3_input_file_location = "s3a://data2insights/NPPES/parquet/npi_clean_zipcode" s3_hcp_output_file_location = "s3a://data2insights/NPPES/parquet/npi_hcp" s3_hco_output_file_location = "s3a://data2insights/NPPES/parquet/npi_hco" npi_subset_df = sqlContext.read.parquet(s3_input_file_location) npi_hcp_subset_df = npi_subset_df.where( npi_subset_df.entity_type_code == 1).coalesce(8) npi_hco_subset_df = npi_subset_df.where( npi_subset_df.entity_type_code == 2).coalesce(8) Utils.write_df_to_s3(npi_hcp_subset_df, s3_hcp_output_file_location) Utils.write_df_to_s3(npi_hco_subset_df, s3_hco_output_file_location) spark.stop()
from pyspark.sql import SparkSession, SQLContext
import src.lib.utils as Utils
import pyspark.sql.functions as F

if __name__ == "__main__":
    spark = SparkSession.builder.appName("data2insights").getOrCreate()
    sqlContext = SQLContext(spark)

    # NOTE: the header of this script is not shown in the original; the paths
    # and upstream DataFrames below are assumed reconstructions.
    s3_uszips_file_location = "s3a://data2insights/uszips/parquet/uszips"  # assumed path
    s3_npi_file_location = "s3a://data2insights/NPPES/parquet/npi_clean_zipcode"
    s3_medicare_counts_file_location = "s3a://data2insights/zipcode/parquet/medicare_count_by_zipcode"
    s3_output_file_location = "s3a://data2insights/zipcode/parquet/combined_neighborhood_master"  # assumed path

    # Per-zip5 NPI record counts by entity type (assumed to be computed this way).
    df_npi_counts = sqlContext.read.parquet(s3_npi_file_location).groupBy(
        "zip5", "entity_type_code").count()
    # Per-zip5 Medicare counts produced by the earlier aggregation script.
    df_medicare_counts = sqlContext.read.parquet(s3_medicare_counts_file_location)

    df_uszips = sqlContext.read.parquet(
        s3_uszips_file_location).withColumnRenamed("zip", "zip5")

    df_hcp = df_npi_counts.where(df_npi_counts.entity_type_code == 1).select(
        F.col("zip5"), F.col("count").alias("hcp_count"))
    df_hco = df_npi_counts.where(df_npi_counts.entity_type_code == 2).select(
        F.col("zip5"), F.col("count").alias("hco_count"))
    df_zip_counts = df_hcp.join(df_hco, ["zip5"], how="full")

    cols = [
        F.col("zip5").alias("zip_code"),
        F.col("state_id").alias("state"),
        F.col("lat").alias("latitude"),
        F.col("lng").alias("longitude"),
        F.col("county_fips").alias("county_fips"),
        F.col("county_name").alias("county_name"),
        F.col("hcp_count").alias("provider_count"),
        F.col("hco_count").alias("institution_count"),
        F.col("zipcode_medicare_count").alias("medicare_count"),
    ]

    df_neighborhood = df_zip_counts.join(df_uszips, ["zip5"], how="full").join(
        df_medicare_counts, ["zip5"], how="full").select(cols)

    Utils.write_df_to_s3(df_neighborhood, s3_output_file_location)

    spark.stop()
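The full outer joins in this script keep zip codes that appear in only one of the sources, with nulls for the counts the other side lacks. A small illustration of that behavior (the sample rows are invented for the example; the fillna(0) at the end is one option, not something this script does):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("full-join-demo").getOrCreate()

hcp = spark.createDataFrame([("02139", 12), ("10001", 7)], ["zip5", "hcp_count"])
hco = spark.createDataFrame([("10001", 3), ("94105", 5)], ["zip5", "hco_count"])

# "02139" and "94105" each survive the full outer join, with null (or 0 after
# fillna) for the count the other side does not have.
hcp.join(hco, ["zip5"], how="full").fillna(0).show()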
from pyspark.sql import SparkSession, SQLContext import src.lib.utils as Utils if __name__ == "__main__": spark = SparkSession.builder.appName("data2insights").getOrCreate() sqlContext = SQLContext(spark) s3_input_file_location = "s3a://data2insights/NPPES/parquet/npi_subset_10000" s3_output_file_location = ( "s3a://data2insights/NPPES/parquet/npi_subset_clean_zipcode") npi_subset_df = sqlContext.read.parquet(s3_input_file_location) npi_subset_clean_zipcode_df = Utils.add_zip5_col( npi_subset_df, "provider_business_practice_location_address_postal_code") Utils.write_df_to_s3(npi_subset_clean_zipcode_df, s3_output_file_location) spark.stop()
from pyspark.sql import SparkSession, SQLContext import src.lib.utils as Utils if __name__ == "__main__": spark = SparkSession.builder.appName("data2insights").getOrCreate() sqlContext = SQLContext(spark) s3_input_file_location = "s3a://data2insights/Census/parquet/2010census" s3_output_file_location = ( "s3a://data2insights/Census/parquet/2010census_clean_zipcode") census_2010_df = sqlContext.read.parquet(s3_input_file_location) census_2010_clean_zipcode_df = Utils.add_zip5_col(census_2010_df) Utils.write_df_to_s3(census_2010_clean_zipcode_df, s3_output_file_location) spark.stop()