from pyspark.sql import SparkSession, SQLContext
import pyspark.sql.functions as F

from src.models.db_config import DbConfig
import src.lib.db_utils as DbUtils

if __name__ == "__main__":
    spark = SparkSession.builder.appName("data2insights").getOrCreate()
    sqlContext = SQLContext(spark)

    db_config = DbConfig(db_name="healthcare2", db_table="public.neighborhood")
    s3_input_file_location = "s3a://data2insights/zipcode/parquet/scored_neighborhood_master"

    cols = [
        F.col("zip_code").alias("zip_code"),
        F.col("state").alias("state"),
        F.col("latitude").alias("latitude"),
        F.col("longitude").alias("longitude"),
        F.col("county_fips").alias("county_fips"),
        F.col("county_name").alias("county_name"),
        F.col("provider_count").alias("provider_count"),
        F.col("institution_count").alias("institution_count"),
        F.col("medicare_count").alias("medicare_count"),
        F.col("score").alias("score"),
    ]

    # Load the scored neighborhood master table from S3 and write it to the
    # public.neighborhood table.
    scored_neighborhood_df = sqlContext.read.parquet(s3_input_file_location).select(cols)
    DbUtils.insert(db_config=db_config, dataframe=scored_neighborhood_df)

    spark.stop()
from pyspark.sql import SparkSession, SQLContext
import pyspark.sql.functions as F

from src.models.db_config import DbConfig
import src.lib.db_utils as DbUtils

if __name__ == "__main__":
    spark = SparkSession.builder.appName("data2insights").getOrCreate()
    sqlContext = SQLContext(spark)

    db_config = DbConfig(db_name="healthcare2", db_table="public.provider")
    s3_input_file_location = "s3a://data2insights/provider/parquet/scored_provider_master"

    cols = [
        F.col("npi").alias("npi"),
        F.col("last_name").alias("last_name"),
        F.col("first_name").alias("first_name"),
        F.col("middle_name").alias("middle_name"),
        F.col("suffix").alias("suffix"),
        F.col("credentials").alias("credentials"),
        F.col("gender").alias("gender"),
        F.col("specialty").alias("specialty"),
        F.col("medicare_count").alias("medicare_count"),
        F.col("score").alias("score"),
        F.col("zip_code").alias("zip_code"),
    ]

    # Load the scored provider (HCP) master table from S3 and write it to the
    # public.provider table.
    scored_hcp_df = sqlContext.read.parquet(s3_input_file_location).select(cols)
    DbUtils.insert(db_config=db_config, dataframe=scored_hcp_df)

    spark.stop()
from pyspark.sql import SparkSession, SQLContext

from src.models.db_config import DbConfig
import src.lib.db_utils as DbUtils

if __name__ == "__main__":
    spark = SparkSession.builder.appName("data2insights").getOrCreate()
    sqlContext = SQLContext(spark)

    db_config = DbConfig(db_name="healthcare", db_table="public.npi_hco")
    s3_input_file_location = "s3a://data2insights/NPPES/parquet/npi_hco"
    limit = 10000000

    cols = [
        "npi",
        "entity_type_code",
        "provider_organization_name_legal_business_name",
        "provider_first_line_business_mailing_address",
        "provider_second_line_business_mailing_address",
        "provider_business_mailing_address_city_name",
        "provider_business_mailing_address_state_name",
        "zip5",
    ]

    # Load NPPES organization (HCO) records from S3, cap the row count, and
    # write the result to the public.npi_hco table.
    df = sqlContext.read.parquet(s3_input_file_location).select(cols).limit(limit)
    DbUtils.insert(db_config=db_config, dataframe=df)

    spark.stop()
from pyspark.sql import SparkSession, SQLContext
import pyspark.sql.functions as F

from src.models.db_config import DbConfig
import src.lib.db_utils as DbUtils

if __name__ == "__main__":
    spark = SparkSession.builder.appName("data2insights").getOrCreate()
    sqlContext = SQLContext(spark)

    db_config = DbConfig(db_name="healthcare2", db_table="public.institution")
    s3_input_file_location = "s3a://data2insights/institution/parquet/scored_institution_master"

    cols = [
        F.col("npi").alias("npi"),
        F.col("name").alias("name"),
        F.col("medicare_count").alias("medicare_count"),
        F.col("score").alias("score"),
        F.col("zip_code").alias("zip_code"),
    ]

    # Load the scored institution master table from S3 and write it to the
    # public.institution table.
    scored_institution_df = sqlContext.read.parquet(s3_input_file_location).select(cols)
    DbUtils.insert(db_config=db_config, dataframe=scored_institution_df)

    spark.stop()
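# All four loaders above depend on DbConfig and DbUtils.insert, which are not
# shown in this section. The following is a minimal sketch of what those
# helpers could look like, assuming DbUtils.insert wraps Spark's JDBC
# DataFrameWriter and that the target is PostgreSQL (suggested by the
# "public" schema in the table names). Every field beyond db_name and
# db_table, along with the host/port/credential defaults, is an assumption
# for illustration, not the repository's actual implementation.
from dataclasses import dataclass

from pyspark.sql import DataFrame


@dataclass
class DbConfig:
    """Target-table settings; fields other than db_name/db_table are assumed."""
    db_name: str
    db_table: str
    host: str = "localhost"        # assumed default
    port: int = 5432               # assumed PostgreSQL port
    user: str = "postgres"         # assumed credentials
    password: str = ""


def insert(db_config: DbConfig, dataframe: DataFrame) -> None:
    """Append the dataframe to the configured table via Spark's JDBC writer."""
    jdbc_url = f"jdbc:postgresql://{db_config.host}:{db_config.port}/{db_config.db_name}"
    (
        dataframe.write.format("jdbc")
        .option("url", jdbc_url)
        .option("dbtable", db_config.db_table)
        .option("user", db_config.user)
        .option("password", db_config.password)
        .option("driver", "org.postgresql.Driver")
        .mode("append")  # append assumed; the real helper may truncate or overwrite
        .save()
    )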