import sys


def get_nested_keys(a):
    # collect the keys whose values contain a nested '{' structure
    key_list = []
    for i in a.keys():
        b = a[i]
        if "{" in str(b):
            key_list = key_list + [i]
    return key_list


ignore_list_1 = [
    'entitylogicalname', 'SinkCreatedOn', 'SinkModifiedOn', 'messageid',
    'sourcesystem', 'Id', 'entitydata'
]
ignore_list = [col.lower() for col in ignore_list_1]
sqlContext.sql("use dsc60263_fsm_tz_db")
# scan every table except the dup/bkp/temptz helpers for yesterday's messages
for m in sqlContext.tables("dsc60263_fsm_tz_db").select('tableName').where(
        "tableName not like '%dup%'").where(
        "tableName not like '%bkp%'").where(
        "tableName not like '%temptz%'").collect():
    k = m.asDict().values()[0].encode('utf-8')
    v1 = sqlContext.table(k).filter(
        "to_date(lastupdatedatetime) = date_sub(CAST(current_timestamp() as DATE), 1)"
    )
    y1 = v1.select("message__id").distinct()
    if y1.count() > 0:
        v2 = list(v1.select("message__id").toPandas()["message__id"])
        for msg_id in v2:
            print(str(msg_id).strip() + "\n")
sys.exit(0)
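# For illustration only: the helper flags keys whose stringified values contain
# a '{', i.e. values that still hold nested structures (the sample dict is
# invented).
print(get_nested_keys({'name': 'Acme', 'address': "{'city': 'Detroit'}"}))
# ['address']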
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import DoubleType
from pyspark.sql.types import IntegerType
from pyspark.sql import functions as func

hive_context = HiveContext(sc)

# load data
procedure = hive_context.table("default.procedure")
hospitals = hive_context.table("default.hospital")
measures = hive_context.table("default.measure")
surveys = hive_context.table("default.survey")

# cast types
procedure_typecast = procedure.withColumn(
    "score", procedure["score"].cast(DoubleType())).withColumn(
    "sample", procedure["sample"].cast(IntegerType())).withColumn(
    "denominator", procedure["denominator"].cast(IntegerType()))
surveys_typecast = surveys.withColumn(
    "hcahps_base", surveys["hcahps_base"].cast(IntegerType())).withColumn(
    "hcahps_consistency", surveys["hcahps_consistency"].cast(IntegerType()))

surveys_selected = surveys_typecast.select(
    'provider_id', 'hcahps_base', 'hcahps_consistency')
procedures_selected = procedure_typecast.select(
    'provider_id', 'sample', 'score', 'measure_id')

# calculate hospital average scores
hospital_avg = procedure_typecast.where(
    (procedure_typecast['score'] < 100) & (procedure_typecast['sample'] > 50)
).groupby('provider_id').agg(func.avg('score'))
hosp_avg_surv = hospital_avg.join(
    surveys_selected, surveys_selected.provider_id == hospital_avg.provider_id)

# calculate correlation of hospital average score with the HCAHPS scores
hosp_avg_surv.stat.corr('avg(score)', 'hcahps_base')         # -0.09947309348326736
hosp_avg_surv.stat.corr('avg(score)', 'hcahps_consistency')  # 0.03405962378975164

# calculate per-measure average scores and correlate them as well
measure_avg = procedure_typecast.where(
    (procedure_typecast['score'] < 100) & (procedure_typecast['sample'] > 50)
).groupby('measure_id').agg(func.avg('score'))
procedures_surveys = procedures_selected.join(
    surveys_selected, surveys_selected.provider_id == procedures_selected.provider_id)
measure_avg_surv = measure_avg.join(
    procedures_surveys, procedures_surveys.measure_id == measure_avg.measure_id)
measure_avg_surv.stat.corr('avg(score)', 'hcahps_base')  # -0.0005003159809636114
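# Aside: agg(func.avg('score')) names its output column literally 'avg(score)',
# which is why stat.corr above references that exact string. An alias (a
# sketch; 'avg_score' is an invented name) makes the column explicit:
hospital_avg_named = procedure_typecast.where(
    (procedure_typecast['score'] < 100) & (procedure_typecast['sample'] > 50)
).groupby('provider_id').agg(func.avg('score').alias('avg_score'))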
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import DoubleType
from pyspark.sql.types import IntegerType
from pyspark.sql import functions as func
from pyspark.sql.functions import col

hive_context = HiveContext(sc)

procedure = hive_context.table("default.procedure")
hospitals = hive_context.table("default.hospital")

procedure_typecast = procedure.withColumn(
    "score", procedure["score"].cast(DoubleType())).withColumn(
    "sample", procedure["sample"].cast(IntegerType())).withColumn(
    "denominator", procedure["denominator"].cast(IntegerType()))

procedure_hospital = procedure_typecast.join(
    hospitals, procedure_typecast.provider_id == hospitals.provider_id)

# subset for procedures with a score of at most 100 and a sample of more than 50
score_avg = procedure_hospital.where(
    (procedure_hospital['score'] <= 100) & (procedure_hospital['sample'] > 50)
).groupby('state').agg(func.avg('score'))

# show the 10 best states
score_avg.sort(score_avg['avg(score)'].desc()).show(10)
import re

ignore_list = [
    'entitylogicalname', 'SinkCreatedOn', 'SinkModifiedOn', 'messageid',
    'sourcesystem', 'Id'
]
g1 = 0
g2 = 0
sqlContext.sql("use dsc60230_gcct_tz_db")
for m in sqlContext.tables("dsc60230_gcct_tz_db").select('tableName').where(
        "tableName not like '%bad%'").where(
        "tableName not like '%dup%'").where(
        "tableName not like '%bkp%'").where(
        "tableName not like '%temptz%'").collect():
    k = m.asDict().values()[0].encode('utf-8')
    v1 = sqlContext.table(k).where(
        "to_date(lastupdatedatetime) = date_sub(current_date, 1)")
    g1_1 = v1.count()
    g1 = g1 + g1_1
    v3 = {}
    # flatten each Row to a plain string and pull out keys whose values open
    # a nested '{'
    v2 = v1.rdd.map(lambda x: str(x).replace("Row(", "").replace(
        "u'", "").replace("'", "").replace("\\", "").replace('"', "")
    ).map(lambda z: re.findall(r',([a-zA-Z]+):{', z)).filter(
        lambda x: len(x) > 0).collect()
    v3 = set([item for sublist in v2 for item in sublist])
    if len(v3) > 0:
        print("\nThe table " + str(k) + " seems to have nested data " +
              str(v3) + "\n\n")
    y1 = v1.groupby("shakey").count().filter("count > 1")
    if y1.count() > 0:
        y2 = y1.select("shakey").toPandas().to_string().replace("\n", ";")
        print("\nTable: " + str(k) + " has " + str(y1.count()) +
              " duplicate(s)" + str(y2) + "\n\n")
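# For illustration only: what the regex above extracts from a hypothetical
# flattened Row string (the sample record is invented).
import re
sample = ",accountid:12,address:{city:Detroit,zip:48201},name:Acme"
print(re.findall(r',([a-zA-Z]+):{', sample))  # ['address']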
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import lit
from pyspark.sql.types import StringType

# instantiate SparkContext for spark-submit
conf = SparkConf().setAppName("transforming effective care and readmissions")
sc = SparkContext(conf=conf)

# read Hive tables
hive_context = HiveContext(sc)
measure_effective_care = hive_context.table("default.measure_effective_care")

# select the relevant columns
measure_effective_care.registerTempTable("effective_temp")
measure_effective_selected = hive_context.sql(
    "SELECT provider_id, measure_id, score, sample, measure_start, measure_end FROM effective_temp"
)

# add a column to flag the data as effective care
measure_effective_with_care_type = measure_effective_selected.withColumn(
    'care_type', lit("effective").cast(StringType()))

# add an empty column for readmission denominators
measure_effective_with_care_type_denominator = measure_effective_with_care_type.withColumn(
    'denominator', lit(None).cast(StringType()))

# read in readmission data
measure_readmission = hive_context.table("default.measure_readmission")
measure_readmission.registerTempTable("readmission_temp")
measure_readmission_selected = hive_context.sql(
    "SELECT provider_id, measure_id, denominator, score, measure_start, measure_end FROM readmission_temp"
)
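# A plausible continuation, not part of the original snippet: flag the
# readmission rows the same way, align the columns (readmissions have no
# 'sample', and the effective-care rows got an empty 'denominator' above),
# and union the two DataFrames. The variable names below are assumptions.
measure_readmission_with_care_type = measure_readmission_selected.withColumn(
    'care_type', lit("readmission").cast(StringType())).withColumn(
    'sample', lit(None).cast(StringType()))
columns = ['provider_id', 'measure_id', 'score', 'sample',
           'measure_start', 'measure_end', 'care_type', 'denominator']
measures_combined = measure_effective_with_care_type_denominator.select(
    *columns).unionAll(measure_readmission_with_care_type.select(*columns))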
    'msdyn_surveyresponse', 'optionsetmetadata', 'phonecall', 'queue',
    'queueitem', 'sla', 'slaitem', 'slakpiinstance', 'socialactivity',
    'socialprofile', 'statemetadata', 'statusmetadata', 'systemuser', 'task',
    'team', 'teammembership', 'territory', 'gcct_marketingprogram'
]
nested = ''
duplicates = ''
data_discrepancy = ''
new_fields = ''
new_entities = ''
#for m in sqlContext.tables("dsc10742_gcctmsd_lz_db").select('tableName').where("tableName not like '%bad%'").where("tableName not like '%dup%'").where("tableName not like '%bkp%'").where("tableName not like '%temptz%'").collect():
#    k = m.asDict().values()[0].encode('utf-8')
for k in tables_gcct_except_bad_rec:
    v1 = sqlContext.table(k).filter(
        "to_date(lastupdatedatetime) = date_sub(CAST(current_timestamp() as DATE), "
        + str(days_to_go_back) + ")")
    g1_1 = v1.count()
    g1 = g1 + g1_1
    v3 = {}
    # decode each entitystring, parse its entitydata payload, and look for
    # nested keys
    v2 = v1.select("entitystring").rdd.map(
        lambda x: x.asDict().values()[0].encode('utf-8')).map(
        lambda u: str(u).replace('null', 'None').replace(
            'false', 'False').replace('true', 'True')).map(
        lambda y: ast.literal_eval(ast.literal_eval(y)['entitydata'])).map(
        lambda u: get_nested_keys(u)).filter(
        lambda x: len(x) > 0).collect()
    v3 = set([item.lower() for sublist in v2 for item in sublist])
    if len(v3) > 0:
        print("\nThe table " + str(k) + " seems to have nested data " + str(
            v3).replace("set([", "").replace("])", "") + "\n\n")
        nested = 'Nested data is observed'
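# For illustration only: why ast.literal_eval is applied twice above. A
# hypothetical entitystring is a dict literal whose 'entitydata' value is
# itself a stringified dict (all sample values are invented).
sample = "{'messageid': '42', 'entitydata': \"{'name': 'Acme', 'address': {'city': 'Detroit'}}\"}"
outer = ast.literal_eval(sample)               # first pass: the wrapper dict
inner = ast.literal_eval(outer['entitydata'])  # second pass: the payload dict
print(get_nested_keys(inner))                  # ['address']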
import ast
import commands

from pyspark.sql import Row

Json_location = "/project/dsc/prod/archive/60263_fsm"
sqlContext.sql("use " + db)
sqlContext.sql("show tables").show()

# list the archive directory and format each entry as "{'path':'date'}"
messages_2_days = commands.getoutput(
    "hadoop fs -ls " + Json_location +
    " | awk -F' +' -v g=\"{\" -v h=\"}\" -v q=\"'\" -v s=\":\" '{print g q$8q s q$6q h}'"
).splitlines()[1:]

msg_2_days_dict = {}
for i in messages_2_days:
    dict_ = ast.literal_eval(i)
    if dict_.keys()[0] != '':
        msg_2_days_dict.update(dict_)
msg_2_days_dict

list_ids_in_db = sqlContext.table("audit").select("id").rdd.map(
    lambda l356: l356["id"]).collect()
#list_ids_in_db=[i424["id"] for i424 in list_ids_in_db_row if i424["id"] !='']
#list_ids_in_db

# take the first two paths that are not yet recorded in the audit table
list_2_load_keys = list(
    set(msg_2_days_dict.keys()).difference(set(list_ids_in_db)))[0:2]
if len(list_2_load_keys) == 0:
    print("No data to load")
    exit()

list_2_load_str = "/* ".join(filter(None, list_2_load_keys)) + "/*"
list_2_load_str

ids_2_load = [[i425, msg_2_days_dict[i425]] for i425 in list_2_load_keys]
#ids_2_load=[i400.split(" ") for i400 in list_ids_2_load]

R = Row('id', 'date_')
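# For illustration only: each line that the awk pipeline emits should look
# like the hypothetical sample below ($8 is the HDFS path, $6 the modification
# date), which ast.literal_eval turns into a one-entry dict. The path and date
# are invented.
sample_line = "{'/project/dsc/prod/archive/60263_fsm/msg_0001':'2019-05-14'}"
print(ast.literal_eval(sample_line))
# {'/project/dsc/prod/archive/60263_fsm/msg_0001': '2019-05-14'}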
retweeted = retweeted.withColumn(
    "ratio_tweet_retweeted",
    retweeted["retweeted"] / retweeted["tweets"])\
    .orderBy("ratio_tweet_retweeted", ascending=False)
retweeted.limit(10).show()

# *************** VISUALIZATION WITH HIVE ************************
hiveContext = HiveContext(sc)

# In this case the tables are already stored in Hive. We can list them with:
hiveContext.tables().show()

# Load the data from the tweets table
tweets = hiveContext.table("tweets28a")
print("\nThe loaded data contains {} tweets\n".format(tweets.count()))
tweets.printSchema()

# Analyse the tweets that are geolocated
hiveContext.sql('DROP TABLE IF EXISTS tweets')
hiveContext.registerDataFrameAsTable(tweets, "tweets")
tweets_place = hiveContext.sql(
    "SELECT place.name, COUNT(text) AS tweets FROM tweets WHERE place IS NOT NULL GROUP BY place.name ORDER BY tweets DESC"
)
tweets_place.limit(10).show()

# We can run the same analysis with RDDs, as sketched below
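# A minimal sketch of that RDD version, assuming each Row exposes 'place'
# (a struct with a 'name' field) and 'text' as attributes; take(10) mirrors
# the LIMIT 10 / limit(10) used above.
tweets_place_rdd = (tweets.rdd
    .filter(lambda row: row.place is not None)
    .map(lambda row: (row.place.name, 1))
    .reduceByKey(lambda a, b: a + b)
    .sortBy(lambda pair: pair[1], ascending=False))
print(tweets_place_rdd.take(10))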
sqlContext.sql("use " + db) #sqlContext.sql("show tables").show() messages_2_days = commands.getoutput( "hadoop fs -ls " + Json_location + " | awk -F' +' -v g=\"{\" -v h=\"}\" -v q=\"'\" -v s=\":\" '{print g q$8q s q$6q h}'" ).splitlines()[1:] msg_2_days_dict = {} for i in messages_2_days: dict_ = ast.literal_eval(i) if dict_.keys()[0] != '': msg_2_days_dict.update(dict_) msg_2_days_dict list_ids_in_db = sqlContext.table("latluri1.fsm_audit").select("id").rdd.map( lambda l356: l356["id"]).collect() #list_ids_in_db=[i424["id"] for i424 in list_ids_in_db_row if i424["id"] !=''] #list_ids_in_db list_2_load_keys = list( set(msg_2_days_dict.keys()).difference(set(list_ids_in_db)))[0:3] if len(list_2_load_keys) == 0: print("No Process ids to load") exit() list_2_load_str = "/* ".join(filter(None, list_2_load_keys)) + "/*" list_2_load_keys_ids = [str(i429).split("/")[-1] for i429 in list_2_load_keys] print("\n\nThe following processids are loaded " + str(list_2_load_keys_ids).replace("[", "").replace("]", "")) ids_2_load = [[i425, msg_2_days_dict[i425]] for i425 in list_2_load_keys]
ignore_list = [col.lower() for col in ignore_list_1]
g1 = 0
g2 = 0
sqlContext.sql("use dsc60230_gcct_tz_db")
tables_gcct_except_bad_rec = [
    'account', 'activityparty', 'annotation', 'appointment', 'businessunit',
    'calendar', 'calendarrule', 'contact', 'cxlvhlp_chatactivity',
    'cxlvhlp_chatqueuestatistic', 'cxlvhlp_surveyitem', 'email', 'fax',
    'gcct_accountresponsibleagent', 'gcct_additionalsymptomcodes',
    'gcct_addportalmessage', 'gcct_arbitrationclaimprocessing',
    'gcct_buybackevaluationmilestones', 'gcct_caseassignment',
    'gcct_caseclassification', 'gcct_casedispositiontype', 'gcct_coachback',
    'gcct_country', 'gcct_customersatisfactiontools',
    'gcct_delegationofauthority', 'gcct_demandltrtpsmclaimsprocessing',
    'gcct_doaprogramcode', 'gcct_documentcustomerrecontact', 'gcct_engine',
    'gcct_executive', 'gcct_executiveliaison',
    'gcct_fieldinvolvementassistancerequest', 'gcct_fmcccontract',
    'gcct_fulfillmentvendor', 'gcct_genericinformation',
    'gcct_genericinformationtopic', 'gcct_generictopic',
    'gcct_goodfaithreview', 'gcct_knownpartsdelay', 'gcct_lemonlawcriteria',
    'gcct_loanercost', 'gcct_loyaltyassistance', 'gcct_loyaltycostdetails',
    'gcct_loyaltyprogramcode', 'gcct_material', 'gcct_materialrequest',
    'gcct_materialrequestdetail', 'gcct_offer', 'gcct_offerdetail',
    'gcct_onlinegaragevehicles', 'gcct_partsorderstatus',
    'gcct_partsordersystems', 'gcct_partssmeassistancerequest',
    'gcct_pcarequest', 'gcct_priordealerdecision', 'gcct_qaincident',
    'gcct_qamonitoring', 'gcct_queuepriority', 'gcct_ravprocessingmilestones',
    'gcct_reactdata', 'gcct_reacttransmission', 'gcct_reasoncode',
    'gcct_recall', 'gcct_region', 'gcct_rentaldetail',
    'gcct_slaconfiguration', 'gcct_sms', 'gcct_socialmediaconversation',
    'gcct_socialmediamessage', 'gcct_socialmediaprofile',
    'gcct_specialloanercode', 'gcct_state', 'gcct_surveyconfiguration',
    'gcct_symptomclassification', 'gcct_tasktype',
    'gcct_technicalassistancerequest', 'gcct_testdriverequest',
    'gcct_timezone', 'gcct_userdelegationofauthority', 'gcct_vehicle',
    'gcct_vehiclebrand', 'gcct_vehicleclass', 'gcct_vehicleloyaltyallowance',
    'gcct_vehicleoffroad', 'gcct_vehicleowner', 'gcct_warrantyhistory',
    'gcct_warrantyloanerpartsdelay', 'gcct_warrantyloanerrequest',
    'gcct_warrantyloanerrequesthistory', 'globaloptionsetmetadata',
    'incident', 'incidentresolution', 'letter', 'msdyn_answer',
    'msdyn_question', 'msdyn_questionresponse', 'msdyn_survey',
    'msdyn_surveyinvite', 'msdyn_surveyresponse', 'optionsetmetadata',
    'phonecall', 'queue', 'queueitem', 'sla', 'slaitem', 'slakpiinstance',
    'socialactivity', 'socialprofile', 'statemetadata', 'statusmetadata',
    'systemuser', 'task', 'team', 'teammembership', 'territory'
]
nested = ''
duplicates = ''
data_discrepancy = ''
new_fields = ''
new_entities = ''
#for m in sqlContext.tables("dsc60230_gcct_tz_db").select('tableName').where("tableName not like '%bad%'").where("tableName not like '%dup%'").where("tableName not like '%bkp%'").where("tableName not like '%temptz%'").collect():
#    k = m.asDict().values()[0].encode('utf-8')
for k in tables_gcct_except_bad_rec:
    v1 = sqlContext.table(k).filter(
        "to_date(lastupdatedatetime) = date_sub(CAST(current_timestamp() as DATE), 1)")
    g1_1 = v1.count()
    g1 = g1 + g1_1
    v3 = {}
    # decode each entitystring, parse its entitydata payload, and look for
    # nested keys
    v2 = v1.select("entitystring").rdd.map(
        lambda x: x.asDict().values()[0].encode('utf-8')).map(
        lambda u: str(u).replace('null', 'None').replace(
            'false', 'False').replace('true', 'True')).map(
        lambda y: ast.literal_eval(ast.literal_eval(y)['entitydata'])).map(
        lambda u: get_nested_keys(u)).filter(
        lambda x: len(x) > 0).collect()
    v3 = set([item.lower() for sublist in v2 for item in sublist])
    if len(v3) > 0:
        print("\nThe table " + str(k) + " seems to have nested data " +
              str(v3).replace("set([", "").replace("])", "") + "\n\n")
        nested = 'Nested data is observed'
    y1 = v1.groupby("shakey").count().filter("count > 1")
    if y1.count() > 0:
        y2 = y1.select("shakey").toPandas().to_string(index=False).replace("\n", ";")
        print("\nTable: " + str(k) + " has " + str(y1.count()) +
              " duplicate(s)" + str(y2) + "\n\n")
        duplicates = 'Duplicate(s) are observed'
    col_list = set([col.lower() for col in v1.columns])
    # compare the keys found inside entitystring/entitydata with the table's
    # actual columns
    v4 = v1.select("entitystring").rdd.map(
        lambda x: x.asDict().values()[0].encode('utf-8')).map(
        lambda u: str(u).replace('null', 'None').replace(
            'false', 'False').replace('true', 'True')).map(
        lambda y: ast.literal_eval(ast.literal_eval(y)['entitydata']).keys() +
                  ast.literal_eval(y).keys()).map(
        lambda w: [x.lower() for x in w]).map(
        lambda x: set(x).difference(col_list)).collect()
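    # A plausible continuation (assumed, not from the original): flatten the
    # per-row key differences in v4, drop the audit columns in ignore_list,
    # and report anything left as a new field arriving through entitydata.
    v5 = set([f for diff in v4 for f in diff]).difference(set(ignore_list))
    if len(v5) > 0:
        print("\nTable: " + str(k) + " has new field(s) " +
              str(sorted(v5)) + "\n\n")
        new_fields = 'New field(s) are observed'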