def table_schema_from_spark(hcat_table_name):
    # returns schema of table with this database.name in hcatalog
    # (spark-workaround as long as hcatweb api is not available...)
    # initialize spark
    import findspark
    findspark.init()
    import pyspark
    from pyspark.sql import HiveContext

    sc_conf = pyspark.SparkConf()
    #sc_conf.set('spark.executor.extraClassPath','/opt/cloudera/parcels/CDH/lib/hive/lib/*')
    #sc_conf.set('spark.master','yarn-client')
    sc = pyspark.SparkContext(appName='ade_get_table_schema', conf=sc_conf)
    hc = HiveContext(sc)

    hive_schema = hc.table(hcat_table_name).schema.jsonValue()
    print hive_schema
    sc.stop()

    table_schema = {'columns': {}}
    col_sequence = 0
    for field in hive_schema['fields']:
        table_schema['columns'][field['name']] = {'col_sequence': col_sequence,
                                                  'type': field['type']}
        col_sequence += 1
    return table_schema
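# A minimal usage sketch (the table name 'default.sample_table' is an assumption); the
# returned dict maps each column name to its position and Hive type.
if __name__ == '__main__':
    schema = table_schema_from_spark('default.sample_table')
    for col_name, col_info in sorted(schema['columns'].items(),
                                     key=lambda kv: kv[1]['col_sequence']):
        print '%s (%s)' % (col_name, col_info['type'])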
def create_dataframe_from_hive(spark_session, dbConnectionParams):
    df = None
    try:
        sc = SparkSession.builder.appName("Testing").config(
            conf=SparkConf()).enableHiveSupport().getOrCreate()
        sqlContext = HiveContext(sc)
        sqlContext.setConf(
            "hive.metastore.uris",
            "thrift://{}:{}".format(dbConnectionParams.get("host"),
                                    dbConnectionParams.get("port")))
        tdf = sqlContext.sql("show databases")
        tdf.show()
        schema = DataLoader.get_db_name(dbConnectionParams)
        table_name = dbConnectionParams.get("tablename")
        df = sqlContext.table(".".join([schema, table_name]))
    except Exception as e:
        print("couldn't connect to hive")
        raise e
    return df
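# Hypothetical call sketch (the host/port/table values are assumptions; DataLoader and the
# pyspark imports are defined elsewhere in the original project):
params = {"host": "metastore-host", "port": 9083, "tablename": "my_table"}
hive_df = create_dataframe_from_hive(None, params)  # spark_session is not used by the function body
hive_df.show(5)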
from pyspark import SparkContext
from pyspark.sql import HiveContext
from graphframes import GraphFrame


def filtering(alpha_thr):
    if __name__ == "__main__":
        sc = SparkContext(appName="Edges Filtering Hive v1")
        hc = HiveContext(sparkContext=sc)

        tbl = hc.table("mi2mi.edges")
        tbl.registerTempTable("edges")

        edgesDF = hc.sql("select SID1, SID2, EdgeCost from edges where MilanoDate='2013-11-01'")
        edgesDF.show()

        v = hc.sql("select distinct SID1 id from edges where MilanoDate='2013-11-01' order by SID1")
        e = hc.sql("select SID1 src, SID2 dst, EdgeCost cost from edges where MilanoDate='2013-11-01'")
        d = hc.sql("select sid1, sum(EdgeCost) node_strength from edges where sid1 != sid2 and MilanoDate='2013-11-01' group by sid1")

        g = GraphFrame(v, e)
        g.vertices.show()
        g.edges.show()

        d = hc.sql("select sid1, sid2, 1 - (select count(distinct sid1) - 2 from edges where MilanoDate=e.MilanoDate)*(EdgeCost/(select sum(EdgeCost) node_strength from edges where sid1 != sid2 and MilanoDate=e.MilanoDate and sid1 = e.sid1 group by sid1) + pow(1 - EdgeCost/(select sum(EdgeCost) node_strength from edges where sid1 != sid2 and MilanoDate=e.MilanoDate and sid1 = e.sid1 group by sid1), (select count(distinct sid1) - 1 from edges where MilanoDate=e.MilanoDate))) /((select count(distinct sid1) - 1 from edges where MilanoDate=e.MilanoDate) * (EdgeCost/(select sum(EdgeCost) node_strength from edges where sid1 != sid2 and MilanoDate=e.MilanoDate and sid1 = e.sid1 group by sid1) - 1)) alpha from edges e where MilanoDate='2013-11-01'")
        d.show()
# get_dependencies, score_single_event and APPS_PATH are helpers/constants defined elsewhere
# in the original script.
import argparse

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import functions as F
from pyspark.sql import types


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--output_path',
        help='output path in s3 including bucket without file name')
    args = parser.parse_args()
    if args.output_path:
        output_path = args.output_path
    else:
        raise ValueError('missing argument - output_path')

    model_name = 'photography'
    spark_config = SparkConf().setAppName(model_name)
    dependencies = get_dependencies()
    spark_context = SparkContext(conf=spark_config, pyFiles=dependencies)
    hive_context = HiveContext(spark_context)

    is_photo_related = F.udf(
        lambda s: True if ('camera' in s) or ('video' in s) else False,
        types.BooleanType())
    get_event_score = F.udf(score_single_event, types.FloatType())
    # received notification at least as many times as viewed
    fix_received = F.udf(lambda received, view: max(received, view),
                         types.FloatType())

    # TODO: switch to l1 home_events_uuid
    events = hive_context.table('l2_sprint.mixpanel_home')

    # choose photography related content interactions from notifications
    # (devicefeatures_attribute exists only for notification items, not future cards)
    # relevant content started approximately '2017-10-31'
    content_items = events \
        .filter(events['event'].isin(
            [x.lower() for x in ['ContentItem_Received',
                                 'ContentItem_View',
                                 'ContentItem_Click',
                                 'ContentItem_TimeOnPage',
                                 'ContentItem_PageScroll']])) \
        .filter(events['Time'.lower()] > '2017-10-31') \
        .filter(events['CarrierName'.lower()].isin('sprint', 'verizon')) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['IsTest']".lower()) == 'false') \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['DeviceId']".lower()).isNotNull()) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['MessageType']".lower()).isNotNull()) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['devicefeatures_attribute']").isNotNull()) \
        .filter(is_photo_related(F.get_json_object(
            events['properties'], "$['properties']['devicefeatures_attribute']")))

    # assign a score for each interaction
    content_items = content_items \
        .withColumn(
            'score',
            get_event_score(
                events['event'],
                F.get_json_object(events['properties'], "$['properties']")))

    # aggregate score per user, item, event, action (action to differentiate clicks).
    # use max on score because page scroll, for example, sends intermediate states.
    # use max on properties in case it's null or an empty string in one of the events
    content_items = content_items \
        .groupBy(
            F.get_json_object(
                events['properties'],
                "$['properties']['DeviceId']".lower()).alias('device_id'),
            events['event'],
            F.get_json_object(
                events['properties'],
                "$['properties']['MessageType']".lower()).alias('topic'),
            F.get_json_object(
                events['properties'],
                "$['properties']['ActionId']".lower()).alias('action')) \
        .agg(
            F.max(F.get_json_object(
                events['properties'],
                "$['properties']['AtlasUniqueUserId']".lower())).alias('user_id'),
            F.max('CarrierName'.lower()).alias('carrier_name'),
            F.max('DeviceModel'.lower()).alias('device_model'),
            F.max('DeviceModelName'.lower()).alias('device_model_name'),
            F.max('DeviceOsType'.lower()).alias('device_os_type'),
            F.max('DeviceVendor'.lower()).alias('device_vendor'),
            F.max('score').alias('score'))

    # FIXME fix view according to action events
    received_content_items = content_items \
        .groupBy('device_id') \
        .pivot('event', ['ContentItem_Received'.lower(),
                         'ContentItem_View'.lower()]).sum('score') \
        .fillna(0.0) \
        .select(
            'device_id',
            fix_received(F.col('contentitem_received'),
                         F.col('contentitem_view')).alias('receive'))

    # calculate the final score per user
    content_items = content_items \
        .filter(events['event'] != 'ContentItem_Received'.lower()) \
        .groupBy('device_id') \
        .agg(
            F.max('user_id').alias('user_id'),
            F.max('carrier_name').alias('carrier_name'),
            F.max('device_model').alias('device_model'),
            F.max('device_model_name').alias('device_model_name'),
            F.max('device_os_type').alias('device_os_type'),
            F.max('device_vendor').alias('device_vendor'),
            F.sum('score').alias('total_score')) \
        .join(received_content_items, 'device_id', 'left') \
        .withColumn('score', F.round(F.col('total_score') / F.col('receive'))) \
        .drop('total_score', 'receive') \
        .withColumn('photography_interest', F.lit(None))

    # choose users who completed the user interest questionnaire
    interests = events \
        .filter(events['event'] == 'Timeline_OnboardingMessage_Click'.lower()) \
        .filter(events['CarrierName'.lower()].isin('sprint', 'verizon')) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['IsTest']".lower()) == 'false') \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['DeviceId']".lower()).isNotNull()) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['ActionId']".lower()) == 'done')

    # assign a score for photography interest
    interests = interests \
        .withColumn(
            'score',
            get_event_score(
                events['event'],
                F.get_json_object(events['properties'], "$['properties']")))

    # subset relevant properties and drop duplicated devices
    # (assuming each user should answer the questionnaire once)
    interests = interests \
        .select(
            F.get_json_object(
                events['properties'],
                "$['properties']['DeviceId']".lower()).alias('device_id'),
            F.get_json_object(
                events['properties'],
                "$['properties']['AtlasUniqueUserId']".lower()).alias('user_id'),
            events['CarrierName'.lower()].alias('carrier_name'),
            events['DeviceModel'.lower()].alias('device_model'),
            events['DeviceModelName'.lower()].alias('device_model_name'),
            events['DeviceOsType'.lower()].alias('device_os_type'),
            events['DeviceVendor'.lower()].alias('device_vendor'),
            'score') \
        .drop_duplicates(['device_id']) \
        .withColumn('photography_interest',
                    F.when(F.col('score') > 0, 1.0).otherwise(0.0))

    # aggregate content and interest scores
    # use max on properties in case it's null or an empty string in one of the events
    photography_user = content_items.union(interests) \
        .groupBy('device_id') \
        .agg(
            F.max('user_id').alias('user_id'),
            F.max('carrier_name').alias('carrier_name'),
            F.max('device_model').alias('device_model'),
            F.max('device_model_name').alias('device_model_name'),
            F.max('device_os_type').alias('device_os_type'),
            F.max('device_vendor').alias('device_vendor'),
            F.sum('score').alias('score'),
            F.max('photography_interest').alias('photography_interest'))

    dgx = hive_context.table('l2_asurion.demographics_dbo_source_dgx')
    mobileid = hive_context.table('l3_sprint.mobileid')

    # FIXME: decrypt ethnicityrollup, dob, ethnicity
    photography_user_augmented = photography_user \
        .join(mobileid.select('mobileuid', 'subid'),
              photography_user['user_id'] == mobileid['mobileuid'], 'left') \
        .join(dgx.select('source_dfx_id', 'nameprefix', 'state', 'age_range',
                         'income_range_vds', 'gender', 'marital_status',
                         'dwelling_type', 'home_ownership', 'length_of_residence',
                         'presence_of_children', 'mail_public_responder_indicator',
                         'mail_responsive_buyer_indicator', 'home_value_range',
                         'networthindicator_rollup', 'wealth_decile',
                         'homeandlandvalue', 'first_mortgage_amount',
                         'level_of_education', 'head_of_household',
                         'professionalrollup', 'premover',
                         'active_fitness_interest', 'golf_interest',
                         'traveler', 'green_advocate'),
              mobileid['subid'] == dgx['source_dfx_id'], 'left')

    apps = hive_context.read.parquet(APPS_PATH)
    photography_user_augmented = photography_user_augmented \
        .join(apps, photography_user_augmented['device_id'] == apps['deviceId'], 'left')

    photography_user_augmented.write.csv('s3://' + output_path,
                                         mode='overwrite',
                                         compression='gzip',
                                         header=True)
# Some hospitals have too few non-NA measures. To have a fair ranking, we want to set a minimum
# bar on the number of non-NA measures a hospital must have to participate in our evaluation.

# For each hospital, find out the number of non-NA measures it has
nonNAMeasureCount = dict(
    df_total_quality.map(lambda r: (r.providerid, r.normalizedscore))
    .combineByKey(  # use combineByKey to count the non-NA measures
        lambda value: 0 if value is None else 1,
        lambda x, value: x if value is None else x + 1,
        lambda x, y: x + y)
    .collect())

# Find the 25th percentile of the non-NA measure counts; this is the minimum bar.
minMeasureCount = np.percentile(nonNAMeasureCount.values(), 25.)

df_hospitals = sqlContext.table("hospitals")

# For the purpose of evaluation, we keep only those hospitals which meet the bar
hospitals_qualified = df_hospitals.map(
    lambda r: (r.providerid,
               r.hospitalname,
               r.state,
               bool(nonNAMeasureCount[r.providerid] >= minMeasureCount
                    if r.providerid in nonNAMeasureCount else False)))

schema = StructType([
    StructField("providerid", StringType(), True),
    StructField("hospitalname", StringType(), True),
    StructField("state", StringType(), True),
    StructField("qualified", BooleanType(), True)])

df_hospitals_qualified = sqlContext.createDataFrame(hospitals_qualified, schema)
saveAsHiveTable(df_hospitals_qualified, "hospitals_qualified")
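# saveAsHiveTable is defined elsewhere in the original script; a minimal sketch of what such
# a helper might look like (an assumption), following the registerTempTable + CREATE TABLE AS
# pattern used in the best_hospitals snippet further below:
def saveAsHiveTable(df, table_name):
    df.registerTempTable("tmp_" + table_name)
    sqlContext.sql("drop table if exists " + table_name)
    sqlContext.sql("CREATE TABLE " + table_name + " AS SELECT * FROM tmp_" + table_name)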
if __name__ == "__main__": if len(sys.argv) != 4: print "Error usage: CreateHive [master] [inputFile] [inputTable]" sys.exit(-1) master = sys.argv[1] inputFile = sys.argv[2] inputTable = sys.argv[3] sc = SparkContext(master, "CreateHive") hiveContext = HiveContext(sc) # create hive table hiveContext.sql( "CREATE TABLE IF NOT EXISTS default." + inputTable + " (a int, b string, c string)") # loading data into hive table hiveContext.sql( "LOAD DATA LOCAL INPATH '" + inputFile + "' INTO TABLE " + inputTable) # read data from hive table sampleData = hiveContext.table("default."+inputTable) sampleData.show() # running sql on hive table sampleData.registerTempTable("test_temp") hiveContext.sql("select * from test_temp").show()
    errors='coerce')
supervised_pd[keyvars] = supervised_pd[keyvars].astype(str)

# remove alerts generated after the case
supervised_pd = supervised_pd.loc[~(
    supervised_pd.ALERT_CREATE_DATE > supervised_pd.CASE_CREATE_DATE)]

# use alert data to filter txn data to reduce computation/memory
supervised_pd_filter = supervised_pd[['FINAL_ACCOUNT_KEY',
                                      'ALERT_MONTH_SK']].drop_duplicates()
supervised_pd_filter.columns = ['account_key', 'alert_month_sk']
supervised_filter = hive_context.createDataFrame(supervised_pd_filter)

# generate monthly txn summary
sam_txn = hive_context.table('udm_cds_transactions1023').where(
    "month_sk >= 216 and month_sk <= 226").withColumn(
    'abs_value', F.abs(col('acct_curr_amount')))
sam_acct = hive_context.table('udm_cds_account0822').where(
    "is_error_account is null").dropDuplicates()
sam_txn_acctsmry = sam_txn.where("acct_curr_amount <> 0") \
    .groupBy(["account_sk", "month_sk"]) \
    .agg(F.sum('abs_value').alias('total_value'),
         CD('transaction_key').alias('total_volume')).alias('t') \
    .join(sam_acct.alias('a'), col('t.account_sk') == col('a.entity_sk'), 'left') \
    .selectExpr('a.account_key', "t.*").alias('t2') \
    .join(supervised_filter.alias('s'),
          [col('t2.account_key') == col('s.account_key'),
           col("t2.month_sk") + 1 == col("s.alert_month_sk")],
          "inner") \
    .selectExpr("t2.*", "s.alert_month_sk").distinct()
sam_txn_acctsmry_pd = sam_txn_acctsmry.toPandas()

# merge alert data and txn summary
supervised_pd_common = supervised_pd.rename(
    index=str,
    columns={"ACCOUNT_KEY": "FORT_ACCOUNT_KEY",
             "FINAL_ACCOUNT_KEY": "account_key",
             "ALERT_MONTH_SK": "alert_month_sk"}) \
    .merge(sam_txn_acctsmry_pd, on=['account_key', 'alert_month_sk'], how='inner')
supervised_pd_common.columns = [
    colu.lower() for colu in supervised_pd_common.columns
from pyspark import SparkContext
from pyspark.sql import HiveContext, DataFrame, Column, Window, DataFrameWriter
from pyspark.sql.functions import rank, col
from datetime import timedelta, datetime
import pytz
import os

sc = SparkContext()
hc = HiveContext(sc)

# table 1
table_lot_history = hc.table("prod_mti_ww_be_idl.tte_2did_lot_history_view")
table_lot_history.registerTempTable("table_lot_history")
# table 2
table_machine_attr = hc.table("prod_mti_ww_be_idl.tte_2did_machine_attr_view")
table_machine_attr.registerTempTable("table_machine_attr")
# table 3
table_lot_relation = hc.table("prod_mti_ww_be_idl.tte_2did_lot_relation_view")
table_lot_relation.registerTempTable("table_lot_relation")
# table 4
table_comp_history = hc.table("prod_mti_ww_be_idl.tte_2did_comp_history_view")
table_comp_history.registerTempTable("table_comp_history")

ctz = pytz.timezone('Singapore')
path_root = '/eng/mti/ww/be/msb/assembly_quality/twodid'

for n in range(1, 2):
    date = (datetime.now(tz=ctz) - timedelta(days=n)).strftime("%Y-%m-%d")
    for t in (1, 2):
        if t == 1:
            time_boundary_1 = date + ' ' + '00:00:00.000'
            time_boundary_2 = date + ' ' + '11:59:59.999'
"IsSystemApp": False, "Name": "Uber", "VersionCode": 13 }] } if __name__ == "__main__": (from_date, to_date, target_path, external_lib) = get_parameters() sc = SparkContext(appName="telemetries", pyFiles=external_lib) sql_context = HiveContext(sc) from telemetries.create_user_apps import load_sent_events from flaten_mixpanel_home_events.pyspark_schema_utils import rdd_to_df tele_sprint = sql_context.table("l1_sprint.telemetry_events") events_sprint = sql_context.table("l2_sprint.mixpanel_home") apps_data_rdd = tele_sprint.filter((tele_sprint.event_date >= '2018-05-01') & (tele_sprint.event_date <= '2018-05-01 ')) \ .filter((tele_sprint.event_name == 'apps') | (tele_sprint.event_name == 'systemapps')) \ .rdd.map(lambda x: (x.event_name , x.os, x.uuid, x.event_date, json.loads(x.json_data), x.agentversion)) \ .filter(lambda x: x[4].get('IsTest', True) == False) \ .map(lambda x: Row(carrierName=x[4]['CarrierName'], deviceId=x[4]['DeviceId'], deviceModel=x[4].get('DeviceModel',None), deviceVendor=x[4].get('DeviceVendor',None), agentTimestamp=x[4].get('AgentTimestamp',None), agentVersion=x[5], telemtryType=x[0], os=x[1], uuid=x[2], date=x[3],
# Created by Raju Kumar Mishra
# Book: PySpark Recipes
# Chapter 8
# Recipe 8-8. Reading data from Apache Hive.
# Run the following PySpark code lines, line by line, in the PySpark shell.

# Step 8-8-1. Creating the HiveContext object.
from pyspark.sql import HiveContext
ourHiveContext = HiveContext(sc)

# Step 8-8-2. Reading table data from Hive.
FilamentDataFrame = ourHiveContext.table('apress.filamenttable')
FilamentDataFrame.show(5)
###############################################################################################################
#
#   Spark - Execute Job Against Hive Table
#
###############################################################################################################

from pyspark.sql import HiveContext
hive_context = HiveContext(sc)

hive_context.sql('show tables').show(25, False)

sample = hive_context.table("mm_teams_sql")
sample.show(10, False)

###############################################################################################################
#
#   Spark - Execute Job Against Phoenix Table
#
###############################################################################################################
# NOTE: Just an example - Not used for IAA Module
# /usr/hdp/2.5.0.0-1245/spark2/bin/pyspark --jars /usr/hdp/2.5.0.0-1245/phoenix/lib/phoenix-spark-4.7.0.2.5.0.0-1245.jar
# /usr/hdp/2.5.0.0-1245/spark2/bin/pyspark --jars /usr/hdp/2.5.0.0-1245/phoenix/phoenix-4.7.0.2.5.0.0-1245-client.jar
#
# survey_resp.py
# Extract useful data from the survey_responses table and store it in a parquet file
#
from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import HiveContext

sc = SparkContext("local", "Exercise1")
hiveCX = HiveContext(sc)

# bring the table into a data frame
dfSurveyResp = hiveCX.table("survey_responses")

# select the relevant info into a new dataframe
dfSurveyRespNew = dfSurveyResp.select(
    col("provider_number").alias("providerID"),
    col("hcahps_base_score").cast(DecimalType()).alias("baseScore"),
    col("hcahps_consistency_score").cast(
        DecimalType()).alias("consistencyScore")).where(
    col("baseScore").isNotNull() & col("consistencyScore").isNotNull())

# write the dataframe out as a parquet file
dfSurveyRespNew.write.parquet("/user/w205/hospital_compare/surveyRespParquet")
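# Optional sanity check (a sketch, assuming the same session is still available): read the
# parquet file back and confirm the selected columns and row count.
dfCheck = hiveCX.read.parquet("/user/w205/hospital_compare/surveyRespParquet")
dfCheck.printSchema()
print(dfCheck.count())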
print correlation

most_common_weaknesses = sql_context.sql("select weakness, count(*) as times "
                                         "from ("
                                         "select explode(weaknesses) as weakness "
                                         "from pokemons "
                                         ") weaknesses "
                                         "group by weakness "
                                         "order by times desc "
                                         "limit 3")

normal_pokemons_not_in_eggs = sql_context.sql("select * from "
                                              "pokemons "
                                              "where array_contains(type, 'Normal') and egg='Not in Eggs'")

print "3 most common weaknesses: {}".format(most_common_weaknesses.collect())
print "Normal pokemons not in eggs: {}".format(normal_pokemons_not_in_eggs.collect())

hive_context = HiveContext(spark_context)
#hive_context.setConf("hive.warehouse.dir", "/Users/adrian/Documents/hive-warehouse")
normal_pokemons_not_in_eggs.write.mode("overwrite").saveAsTable("default.normal_pokemons")

# Show the content of the Hive table we've just created
pokemons_normal_table = hive_context.table("default.normal_pokemons")
pokemons_normal_table.show()
hive_context.read.table("default.normal_pokemons")

print "Normal Pokemons from Hive table: {}".format(sql_context.sql("select * from normal_pokemons").collect())
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from pyspark.sql import HiveContext
from pandas import *

sc = SparkContext("local[*]", "RentalData")
ssc = StreamingContext(sc, 5)

ss = SparkSession.builder \
    .appName(sc.appName) \
    .config("spark.sql.warehouse.dir", "/user/hive/warehouse") \
    .config("hive.metastore.uris", "thrift://localhost:9083") \
    .enableHiveSupport() \
    .getOrCreate()

hive_context = HiveContext(sc)
df = hive_context.table("default.rentals")
rentals = df.toPandas()
rentals.to_csv('mycsv.csv')
print(rentals)

ssc.start()
ssc.awaitTermination()
def tok_str(text, ngrams=1, minChars=2):
    text = re.sub(r'\s+', ' ', text)                # change any whitespace to regular space
    tokens = map(unicode, text.lower().split(' '))  # split into tokens and change to lower case
    tokens = filter(lambda x: len(x) >= minChars and x[0] != '@', tokens)  # remove short words and usernames
    tokens = ["URL" if t[:4] == "http" else t for t in tokens]  # replace any url by the constant word "URL"
    tokens = [punct.sub('', t) for t in tokens]     # remove punctuation from tokens
    if ngrams == 1:
        return tokens
    else:
        return tokens + [' '.join(tokens[i:i + ngrams]) for i in xrange(len(tokens) - ngrams + 1)]

tokenize = F.udf(lambda s: tok_str(unicode(s), ngrams=2), ArrayType(StringType()))

# Load sentiment dictionary
wv = hc.table('sentiment_words').collect()
wordlist = dict([(r.word, r.score) for r in wv])

# get positive sentiment scores from words RDD using word-list
def pscore(words):
    scores = filter(lambda x: x > 0, [wordlist[t] for t in words if t in wordlist])
    return 0.0 if len(scores) == 0 else (float(sum(scores)) / len(scores))

pos_score = F.udf(lambda w: pscore(w), FloatType())

# get negative sentiment scores from words RDD using word-list
def nscore(words):
    scores = filter(lambda x: x < 0, [wordlist[t] for t in words if t in wordlist])
    return 0.0 if len(scores) == 0 else (float(sum(scores)) / len(scores))

neg_score = F.udf(lambda w: nscore(w), FloatType())

# Create feature matrix for the model
if to_date is None:
    to_date = get_date_n_days_ago(1)
if to_date == from_date:
    to_date = (date_parser.parse(to_date) + timedelta(days=1)).strftime('%Y-%m-%d')
print('from_date = {} | to_date = {}'.format(from_date, to_date))

from flaten_mixpanel_home_events.data_provider import load_sent_events, load_received_events, load_dismiss_events, \
    load_view_events, \
    load_time_on_page_events, load_page_scroll_events, load_feedbackmodule_impression_events, \
    load_feedbackmodule_click_events, load_click_events, join_all_events
from flaten_mixpanel_home_events.events_schema import sent_record_example, flatten_data_record_example
from flaten_mixpanel_home_events.pyspark_schema_utils import rdd_to_df

events = sqlContext.table("l2_sprint.mixpanel_home")

sent_events = load_sent_events(events, from_date, to_date, sent_record_example, sqlContext) \
    .drop_duplicates(subset=['message_id'])
received_events = load_received_events(events, from_date, to_date) \
    .drop_duplicates(subset=['received_message_id'])
dismiss_events = load_dismiss_events(events, from_date, to_date) \
    .drop_duplicates(subset=['dismiss_message_id'])
view_events = load_view_events(events, from_date, to_date) \
    .drop_duplicates(subset=['view_message_id'])
time_on_page_events = load_time_on_page_events(events, from_date, to_date) \
    .drop_duplicates(subset=['time_on_page_message_id'])
page_scroll_events = load_page_scroll_events(events, from_date, to_date) \
    .drop_duplicates(subset=['page_scroll_message_id'])
feedbackmodule_impression_events = load_feedbackmodule_impression_events(events, from_date, to_date) \
    .drop_duplicates(subset=['feedback_module_impression_message_id'])
feedbackmodule_click_events = load_feedbackmodule_click_events(events, from_date, to_date) \
sc = SparkContext("local", "best_hospitals") from pyspark.sql import HiveContext sqlContext = HiveContext(sc) # Select the top 10 hospital by average avgscore # Please note that we filter out those hospital not qualified for evaluation df_top10_hospitals = sqlContext.sql("select Q.providerid as id, AVG(Q.normalizedscore) as avgscore \ from total_quality Q join hospitals_qualified H on Q.providerid = H.providerid \ where Q.normalizedscore is not null and H.qualified = true \ group by Q.providerid \ order by avgscore DESC").limit(10) # Join with hospitals_qualified to get the hospital name and state # Note: couldn't figure out how to do it in the above select statement (together with Group By) in one-shot! :-( df_hospitals = sqlContext.table("hospitals_qualified") df_top10_hospitals_full = df_top10_hospitals.join(df_hospitals, df_top10_hospitals.id == df_hospitals.providerid).\ select(df_hospitals.providerid, df_hospitals.hospitalname, df_hospitals.state, df_top10_hospitals.avgscore) df_top10_hospitals_full = df_top10_hospitals_full.orderBy(df_top10_hospitals_full.avgscore.desc()) # Save it as a table df_top10_hospitals_full.registerTempTable("df") sqlContext.sql("drop table if exists top_10_hospitals") sqlContext.sql("CREATE TABLE top_10_hospitals AS SELECT * FROM df") print print "Top 10 hospitals" print rank = 1 for i in df_top10_hospitals_full.collect():
# importing required packages
from pyspark.sql import HiveContext
from pyspark.sql.types import *
from pyspark import SparkContext
from pyspark.sql import Row
#import numpy as np

# setting up spark context and hive context
sc = SparkContext("local", "Simple App")
sqlCtx = HiveContext(sc)

# creating a spark data frame using the hive table effective_care
df_raw = sqlCtx.table("effective_care")
print 'Number of rows in the table {0}'.format(df_raw.count())

# removing all rows not containing numbers for the score variable
# function to test if a string can be parsed as an integer or not
def CheckValidScore(s):
    try:
        int(s)
        return True
    except ValueError:
        return False

# creating an RDD by filtering out invalid scores
df_clean_rdd = df_raw.rdd.filter(lambda row: CheckValidScore(row.score))

# creating a dataframe from the RDD
df_clean = sqlCtx.createDataFrame(df_clean_rdd)
print 'Number of rows in table after cleaning {0}'.format(df_clean.count())
# Test the code on CLOUDERA:
# Original text

from pyspark.sql import HiveContext
from pyspark.sql.functions import *
import ConfigParser
from pyspark import SparkContext, SparkConf

conf = SparkConf().setAppName("Prueba_1")
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = HiveContext(sc)

df1 = sqlContext.sql("SELECT * FROM dwhprod.cen_identificacion LIMIT 100")
df1.show()

df2 = sqlContext.table("dwhprod.cen_identificacion")
df2.show()

# Saving tables
df1.write.mode("overwrite").saveAsTable("dwhprod.base_nueva")

sc.stop()

# JOINS IN IMPALA
# --SELECT * FROM dwhprod.cenpersona LIMIT 100
# --SELECT * FROM dwhprod.cenactividad LIMIT 100
# --fiactividad
"/user/hive/warehouse") \ .config("hive.metastore.uris", "thrift://localhost:9083") \ .enableHiveSupport() \ .getOrCreate() kafkastream = KafkaUtils.createStream(ssc, "localhost:2181", "SW", {"SW": 1}) parsed = kafkastream.map(lambda x: json.loads(x[1])) content = parsed.map(lambda x: x.get("content")) \ .flatMap(lambda x: x.get("properties")) \ .map(lambda x: (x.get("id"), x.get("abbreviation"), x.get("city"), x.get("conference"), x.get("division"), x.get("full_name"), x.get("name"))) content.foreachRDD(Process) ## Create a new DF based on NBA teams in the Southwest Division hive_context = HiveContext(sc) df = hive_context.table("default.SW") new = df.select("id", "conference" "division") \ .withColumn("city", "name") \ .drop(col("full_name")) \ .show() ssc.start() ssc.awaitTermination()
from pyspark.sql.types import *
from pyspark.sql import SQLContext
from pyspark.sql.functions import col
from pyspark.sql import HiveContext
import datetime

# create spark context
#conf = SparkConf().set("spark.sql.warehouse.dir", "/apps/hive/warehouse/")
#sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# creating hive context to fetch data from hive tables
hive_context = HiveContext(sc)

# loading the data from hive tables into spark dataframes
df_retail_sales = hive_context.table("retail_project.retail_sales")
df_retail_stores = hive_context.table("retail_project.retail_stores")
df_retail_features = hive_context.table("retail_project.retail_features")

# converting the data frames to temporary tables
df_retail_sales.createOrReplaceTempView("spark_tab_retail_sales")
df_retail_stores.createOrReplaceTempView("spark_tab_retail_stores")
df_retail_features.createOrReplaceTempView("spark_tab_retail_features")

# Query-01
# the department-wide sales for each store
query_01 = open(
    "/home/reallegendscorp9155/pyspark_project_retail/query_files/query_01.txt"
)
query01 = query_01.read()
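# A likely next step (a sketch; not shown in the original snippet): execute the SQL read from
# query_01.txt against the temp views registered above and inspect the result.
result_01 = hive_context.sql(query01)
result_01.show()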
import pyspark
from pyspark.sql import HiveContext

if __name__ == '__main__':
    config = pyspark.SparkConf().setAppName("Basico")
    sc = pyspark.SparkContext(conf=config)
    hive_context = HiveContext(sc)
    log = hive_context.table("CURSOBIGDATA.apachelog")
    log.show()
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.functions import col
from pyspark.sql.functions import array

conf = SparkConf()
sc = SparkContext(conf=conf)
hive_context = HiveContext(sc)

combined = hive_context.table("flight.flight_data_denorm")

carrier_desc = "carrier_desc"
origin_desc = "origin_desc"
dest_desc = "dest_desc"

#print combined.select(col("origin_dest_names").getItem(0)).head(3)
#combined.groupBy("ORIGIN_DESC").avg("DEP_DELAY").withColumnRenamed("avg(DEP_DELAY)", "avg_dep_delay").sort(col("avg_dep_delay").desc()).head(5)

combined = combined.groupBy(
    col("origin_dest_names").getItem(0)).avg("dep_delay").withColumnRenamed(
    "avg(dep_delay)", "avg_dep_delay").sort(col("avg_dep_delay").desc()).limit(5)

combined.write.format("csv").save("file:///root/flight_data/query1.csv")
need_state = sqlContext.sql(""" SELECT A.*,B.NS_HARM FROM (SELECT A.*,CONCAT('_',GROUP_CODE,'_',DUNN_CAT_ENGLISH_1) AS NEW_CAT FROM x5_ru_analysis.ak_prod_ovr_1 A) A INNER JOIN X5_RU_ANALYSIS.AK_NS_MAP_OVR_SPLIT B ON A.NEW_CAT = B.KARU_CATEGORY_FINAL """) need_state = need_state.withColumn("need_state", F.upper(compress("NS_HARM"))) need_state = need_state.select("product_code", "need_state") # need_state = need_state.withColumn("format_code",F.upper(compress("format_code"))) ''' rule_mapping = sqlContext.createDataFrame(pd.read_excel("X5_lookup_micro_business_v3.xlsx")) rule_mapping.write.saveAsTable("ak_micro_ns_x5_lookup_business_3", mode='overwrite') ''' rule_mapping = sqlContext.table("ak_micro_ns_x5_lookup_business_3") #rule_mapping = sqlContext.table("ak_micro_ns_x5_lookup ") for i in [ "need_state", "Micro_qualifier_2", "Micro_qualifier_3_5", "Micro_qualifier_6_7" ]: rule_mapping = rule_mapping.withColumn(i, F.upper(compress(i))) df_final = sqlContext.sql(""" SELECT DISTINCT BASKET_ID AS TRANSACTION_CODE, NS_HARM AS NEED_STATE, ITEM_SPEND AS NET_SPEND_AMT, CATEGORY_DESC_ENG AS PROD_HIER_L30_CODE, A.PRODUCT_CODE AS CATEGORY_NAME FROM AK_TRANS_52weeks A
def sql_hive_context_example(spark):
    # create hive context object.
    hive_ctx = HiveContext(spark.sparkContext)

    # createDataFrame
    l = [('Alice', 18), ('Bob', 20), ('Charley', 22)]
    df = hive_ctx.createDataFrame(l, ('name', 'age'))
    print("createDataFrame API finished")

    # registerDataFrameAsTable
    hive_ctx.registerDataFrameAsTable(df, "table1")
    print("registerDataFrameAsTable API finished")

    # sql
    tmp_df = hive_ctx.sql("select * from table1")
    tmp_df.show()
    print("sql API finished")

    # table
    tmp_df = hive_ctx.table("table1")
    tmp_df.show()
    print("table API finished")

    # tableNames
    table_names = hive_ctx.tableNames()
    print(table_names)
    print("tableNames API finished")

    # tables
    tables = hive_ctx.tables()
    print(tables)
    print("tables API finished")

    # range
    tmp_df = hive_ctx.range(1, 10, 2)
    tmp_df.show()
    print("range API finished")

    # dropTempTable
    hive_ctx.dropTempTable("table1")
    table_names = hive_ctx.tableNames()
    print(table_names)
    print("dropTempTable API finished")

    # cacheTable & uncacheTable & clearCache
    df = hive_ctx.range(1, 10, 2)
    hive_ctx.registerDataFrameAsTable(df, "table")
    hive_ctx.cacheTable("table")
    hive_ctx.uncacheTable("table")
    hive_ctx.clearCache()
    print("cacheTable & uncacheTable & clearCache API finished")

    # createExternalTable
    # newSession
    # registerFunction
    #   Deprecated in 2.3.0. Use :func:`spark.udf.register` instead
    # registerJavaFunction
    #   Deprecated in 2.3.0. Use :func:`spark.udf.registerJavaFunction` instead

    # setConf & getConf
    hive_ctx.setConf("key1", "value1")
    value = hive_ctx.getConf("key1")
    print(value)
    print("setConf & getConf API finished")

    # refreshTable
    #   Exception: An error occurred while calling o26.refreshTable:
    #   Method refreshTable([class java.lang.String]) does not exist

    print("Finish running HiveContext API")
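# A minimal driver sketch for sql_hive_context_example (an assumption; the original script
# presumably imports HiveContext and builds its own session elsewhere):
if __name__ == "__main__":
    from pyspark.sql import SparkSession

    spark = SparkSession.builder \
        .appName("hive_context_example") \
        .enableHiveSupport() \
        .getOrCreate()
    sql_hive_context_example(spark)
    spark.stop()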
logException("Error in reading YAML file in TableDataAnalysisForDuplicacy.py"+" "+str(e)) # Create a 2D array for the features from the input dataframe, which has to be tested input_features_arr = inputtablevalues() # Create the array for the input columns from the to be tested file columnarray=[] columnarray.append(filecolumnstomatch) try: # Read each table from the file which has been created after column matching readfile = open(filepath+"colmatchingtables.txt") while True: line = readfile.readline() tableName = line.replace(filenameContent, '') # Create the sql dataframe from Hive table data = sqlContext.table(tableName) # Select the columns which are not string type; remove them and create a numerical only dataframe columnList = [item[0] for item in data.dtypes if item[1].startswith('string') == False] data_number_only = data.selectExpr(columnList).cache() # Fill the NA values with 0 for all columns data_number_only = data_number_only.fillna(0) # Get the approx row count cntInterval = data_number_only.rdd.countApprox(timeout=300000, confidence=.1) # Create the sampled dataframe if row count is more than a million if cntInterval > 1000000: # consider 20% data of equal probability weight (None) with no replacement (False) sampledData = data_number_only.sample(False, .2, None) elif cntInterval > 10000000: sampledData = data_number_only.sample(False, .1, None) else: sampledData = data_number_only
from __future__ import print_function
#!echo $PYTHON_PATH
import os, sys
#import path
from pyspark.sql import *

# create spark sql session
myspark = SparkSession \
    .builder \
    .config("spark.executor.instances", 3) \
    .config("spark.executor.memory", "3g") \
    .config("spark.executor.cores", 2) \
    .config("spark.scheduler.listenerbus.eventqueue.size", 10000) \
    .config("spark.sql.parquet.compression.codec", "snappy") \
    .appName("Sample_07_kmeans") \
    .getOrCreate()

sc = myspark.sparkContext
print(myspark)

myspark.sql("SET spark.sql.parquet.binaryAsString=true")

from pyspark.sql import HiveContext
hive_context = HiveContext(sc)
myview = hive_context.table("default.sample_07p")
myview.show(5)
from pyspark import SparkContext
from pyspark.sql import HiveContext
import time

if __name__ == "__main__":
    sc = SparkContext(appName="Link Filtering Hive v1")
    hc = HiveContext(sparkContext=sc)

    tbl = hc.table("mi2mi.edges")
    tbl.registerTempTable("edges")

    # ties = hc.sql("select MilanoDate, sid1, sid2, 1 - (select count(distinct sid1) - 2 from edges where MilanoDate=e.MilanoDate)*(EdgeCost/(select sum(EdgeCost) node_strength from edges where sid1 != sid2 and MilanoDate=e.MilanoDate and sid1 = e.sid1 group by sid1) + pow(1 - EdgeCost/(select sum(EdgeCost) node_strength from edges where sid1 != sid2 and MilanoDate=e.MilanoDate and sid1 = e.sid1 group by sid1), (select count(distinct sid1) - 1 from edges where MilanoDate=e.MilanoDate))) /((select count(distinct sid1) - 1 from edges where MilanoDate=e.MilanoDate) * (EdgeCost/(select sum(EdgeCost) node_strength from edges where sid1 != sid2 and MilanoDate=e.MilanoDate and sid1 = e.sid1 group by sid1) - 1)) alpha from edges e")
    ties = hc.sql("select MilanoDate, sid1, sid2, pow(1 - EdgeCost/(select sum(EdgeCost) node_strength from edges where sid1 != sid2 and MilanoDate=e.MilanoDate and sid1 = e.sid1 group by sid1), (select count(distinct sid1) - 1 from edges where MilanoDate=e.MilanoDate)) alpha from edges e where sid1 != sid2 order by MilanoDate, sid1, sid2")
    ties.write.format("orc").saveAsTable("mi2mi.LinkFiltering")

    alfa_value = [0.01, 0.05, 0.001]
    ties.filter(ties.alpha < 0.05).show()
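    # The alfa_value list above is defined but never used in the original snippet; one possible
    # intent (a sketch, an assumption) is to inspect the surviving backbone at each threshold:
    for alpha_thr in alfa_value:
        print("alpha < {}: {} edges".format(alpha_thr,
                                            ties.filter(ties.alpha < alpha_thr).count()))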
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

# Initialize Spark
SparkContext.setSystemProperty('spark.executor.memory', '4g')
conf = SparkConf()
conf.set('spark.executor.instances', 20)
sc = SparkContext('yarn-client', 'kdd99', conf=conf)
hc = HiveContext(sc)

kdd = hc.table("kdd99")
(trainData, testData) = kdd.randomSplit([0.7, 0.3], seed=42)
trainData.cache()
services = trainData.withColumnRenamed('service', 'srvc').select('srvc').distinct()
testData = testData.join(services, testData.service == services.srvc)
# filter out any rows with a service not trained upon
testData.cache()

print "training set has " + str(trainData.count()) + " instances"
print "test set has " + str(testData.count()) + " instances"

# Build model
inx1 = StringIndexer(inputCol="protocol", outputCol="protocol-cat")
inx2 = StringIndexer(inputCol="service", outputCol="service-cat")
inx3 = StringIndexer(inputCol="flag", outputCol="flag-cat")
inx4 = StringIndexer(inputCol="is_anomaly", outputCol="label")
from pyspark import SparkContext, SparkConf
from pyspark.sql import Row
from pyspark.sql import HiveContext
import pyspark.sql.functions as f

sc = SparkContext("local", "NBA Stats")
hive_context = HiveContext(sc)
data = hive_context.table("default.nbahive")

#1 Max Height
maxheight = data.groupBy().max('Height').withColumnRenamed(
    "max(Height)", "max_height")
#maxheight.show()
result1 = data.join(maxheight, data.height == maxheight.max_height,
                    "inner").select(data.playername, data.height).distinct()
result1.show()
result1.write.mode("overwrite").saveAsTable("default.tallest")

#2 Min Weight
minweight = data.groupBy().min('Weight').withColumnRenamed(
    "min(Weight)", "min_weight")
#minweight.show()
result2 = data.join(minweight, data.weight == minweight.min_weight,
                    "inner").select('playerName', 'Weight').distinct()
result2.show()
result2.write.mode("overwrite").saveAsTable("default.minWeight")

#3 Most Team Wins
wins = data.select('gmDate', 'teamAbbr', 'Result').filter(data.result == 'Win').distinct()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, HiveContext

hive_context = HiveContext(sc)
pubg = hive_context.table("pubg_new")

# Selecting columns of interest
pubg = pubg.select('boosts', 'damagedealt', 'dbnos', 'headshotkills', 'heals',
                   'killplace', 'killpoints', 'kills', 'killstreaks',
                   'longestkill', 'maxplace', 'numgroups', 'revives',
                   'ridedistance', 'roadkills', 'swimdistance', 'teamkills',
                   'vehicledestroys', 'walkdistance', 'weaponsacquired',
                   'winpoints', 'winplaceperc')
pubg.show(10)
pubg.printSchema()
pubg.cache()

from pyspark.ml.feature import VectorAssembler

# Creating the feature vector
vectorAssembler = VectorAssembler(inputCols=[
    'boosts', 'damagedealt', 'dbnos', 'headshotkills', 'heals', 'killplace',
    'killpoints', 'kills', 'killstreaks', 'longestkill', 'maxplace',
    'numgroups', 'revives', 'ridedistance', 'roadkills', 'swimdistance',
    'teamkills', 'vehicledestroys', 'walkdistance', 'weaponsacquired',
    'winpoints', 'winplaceperc'
], outputCol='features')

# Transforming the dataframe
help="train|inference", default="train") parser.add_argument("-c", "--rdma", help="use rdma connection", default=False) args = parser.parse_args() print("args:", args) # read data # input_data = sc.textFile(args.input).map(lambda ln: [float(x) for x in ln.split(',')]) # change the input data to hive table # input_data = hive_context.table(args.input).map(lambda row: [float(x) for x in row]) # import hiveContext.sql # import hiveContext.implicits._ # input_data = hive_context.sql("select * from "+args.input).map(lambda row: [float(x) for x in row]) input_data = hive_context.table( args.input).map(lambda row: [float(x) for x in row]) # input_data = sqlContext.sql("select * from "+args.input).map(lambda row: [float(x) for x in row]) cluster = TFCluster.run(sc, map_fun, args, num_executors, num_ps, args.tensorboard, TFCluster.InputMode.SPARK) labelRDD = cluster.inference(input_data).map(lambda x: (x, )) # infer the schema schema_rdd = hive_context.createDataFrame(labelRDD, ['predict']) # save schema_rdd.saveAsTable(args.output) cluster.shutdown()
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

# Initialize Spark
SparkContext.setSystemProperty("spark.executor.memory", "4g")
conf = SparkConf()
conf.set("spark.executor.instances", 20)
sc = SparkContext("yarn-client", "kdd99", conf=conf)
hc = HiveContext(sc)

kdd = hc.table("kdd99")
(trainData, testData) = kdd.randomSplit([0.7, 0.3], seed=42)
trainData.cache()
services = trainData.withColumnRenamed("service", "srvc").select("srvc").distinct()
testData = testData.join(services, testData.service == services.srvc)
# filter out any rows with a service not trained upon
testData.cache()

print "training set has " + str(trainData.count()) + " instances"
print "test set has " + str(testData.count()) + " instances"

# Build model
inx1 = StringIndexer(inputCol="protocol", outputCol="protocol-cat")
inx2 = StringIndexer(inputCol="service", outputCol="service-cat")
inx3 = StringIndexer(inputCol="flag", outputCol="flag-cat")
inx4 = StringIndexer(inputCol="is_anomaly", outputCol="label")
ohe2 = OneHotEncoder(inputCol="service-cat", outputCol="service-ohe")
#
# procedures.py
# Extract useful data from the effective_care and readmissions tables and store it in a parquet file
#
from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import HiveContext

sc = SparkContext("local", "Exercise1")
hiveCX = HiveContext(sc)

# bring the tables into data frames
dfEffCare = hiveCX.table("effective_care")
dfReadmissions = hiveCX.table("readmissions")

# select the relevant info into a new dataframe
dfEffCareNew = dfEffCare.withColumn(
    "score", regexp_replace("score", "Low.*", "1")).withColumn(
    "score", regexp_replace("score", "Medium.*", "2")).withColumn(
    "score", regexp_replace("score", "High.*", "3")).withColumn(
    "score", regexp_replace("score", "Very High.*", "4")).select(
    col("provider_id").alias("providerID"),
    col("measure_id").alias("measureID"),
    col("score").cast(DecimalType()).alias("score")).where(
    col("score").isNotNull())

dfReadmissionsNew = dfReadmissions.select(
    col("provider_id").alias("providerID"),
    col("measure_id").alias("measureID"),
    col("score").cast(DecimalType()).alias("score")).where(
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.functions import col
from pyspark.sql.functions import array
from pyspark.sql.functions import collect_list
from pyspark.sql.functions import concat_ws

conf = SparkConf()
sc = SparkContext(conf=conf)
hive_context = HiveContext(sc)

airlines = hive_context.table("flight.flight_data_orc2")
airports = hive_context.table("flight.airport_lookup")
carriers = hive_context.table("flight.carrier_lookup")

carrier_desc = "carrier_desc"
origin_desc = "origin_desc"
dest_desc = "dest_desc"

# we do not observe any duplicates on the airport lookup
airports = airports.dropDuplicates(['code'])

# we group the carriers over code and create a list of all possible descriptions.
# Then, we concat the descriptions as a string with '||' as a separator.
carriers = carriers.groupBy("code").agg(
    collect_list(carriers.description).alias('new_desc')).select(
    [col("code"), concat_ws(" || ", col("new_desc")).alias("description")])

air_car = airlines.join(carriers, airlines.carrier == carriers.code).select(
    [a for a in airlines.columns] +
    [carriers.description.alias(carrier_desc)])
# importing required packages
from pyspark.sql import HiveContext
from pyspark.sql.types import *
from pyspark import SparkContext
from pyspark.sql import Row

# setting up spark context and hive context
sc = SparkContext("local", "Simple App")
sqlCtx = HiveContext(sc)

# creating a spark data frame using the hive table effective_care
df_raw = sqlCtx.table("effective_care")
print 'Number of rows in the table {0}'.format(df_raw.count())

# removing all rows not containing numbers for the score variable
# function to test if a string can be parsed as an integer or not
def CheckValidScore(s):
    try:
        int(s)
        return True
    except ValueError:
        return False

# creating an RDD by filtering out invalid scores
df_clean_rdd = df_raw.rdd.filter(lambda row: CheckValidScore(row.score))

# creating a dataframe from the RDD
df_clean = sqlCtx.createDataFrame(df_clean_rdd)
print 'Number of rows in table after cleaning {0}'.format(df_clean.count())

# converting the data types for the score column
# By default, the SparkContext (sc) is already initialised, hence the SparkContext creation
# line below is commented out.

# Make necessary imports
import datetime
from itertools import chain

from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.functions import create_map, lit, max

# Initialise the SparkContext and HiveContext
# sc = SparkContext("local", "First App")  # Check NOTE 👆
hive_context = HiveContext(sc)

current_date = datetime.datetime.now().strftime('%Y-%m-%d')

# Create a data frame with the raw data available in Hive
clickstream_funnel_df = hive_context.table("default.funnel_clickstreams")

# Assign a code to the various pages. This code will also act as the 'likeliness' of the user
# getting converted. A higher numbered code implies that the user is more likely to buy the ticket.
page_code = {'listing': 1, 'review': 2, 'payments': 3, 'thankyou': 4}
mapping_expr = create_map([lit(x) for x in chain(*page_code.items())])

# Add a new column 'conversion_likeliness' to the dataframe and filter out past searches
conversion_likeliness_data_df = clickstream_funnel_df.withColumn(
    'conversion_likeliness', mapping_expr[clickstream_funnel_df['page_name']])
conversion_likeliness_data_df = conversion_likeliness_data_df.filter(
    conversion_likeliness_data_df.departure_date > current_date).alias('df')

# Select unique searches by the user and fetch the max 'likeliness' of conversion.
unique_searches_df = conversion_likeliness_data_df. \
    groupBy('userid', 'origin', 'destination', 'departure_date'). \
# importing required packages
from pyspark.sql import HiveContext
from pyspark.sql.types import *
from pyspark import SparkContext
from pyspark.sql import Row
from pyspark.mllib.stat import Statistics

# setting up spark context and hive context
sc = SparkContext("local", "Simple App")
sqlCtx = HiveContext(sc)

# creating spark data frames using the hive tables survey_response and effective_care
df_survey_raw = sqlCtx.table("survey_response")
df_care_raw = sqlCtx.table("effective_care")
print 'Number of rows in the survey table {0}'.format(df_survey_raw.count())
print 'Number of rows in the effective_care table {0}'.format(df_care_raw.count())

# removing all rows not containing numbers for the score variable
# function to test if a string can be parsed as an integer or not
def CheckValidScore(s):
    try:
        int(s)
        return True
    except ValueError:
        return False

# creating RDDs by filtering out invalid scores
df_survey_clean_rdd = df_survey_raw.rdd.filter(lambda row: CheckValidScore(row.hcahps_base_score))
df_care_clean_rdd = df_care_raw.rdd.filter(lambda row: CheckValidScore(row.score))