# Tail of a helper (its "def" line is not in this excerpt) that collects the
# keys of dict `a` whose values appear to contain nested JSON objects.
    for i in a.keys():
        b = a[i]
        if "{" in str(b):
            key_list = key_list + [i]
    return (key_list)


ignore_list_1 = [
    'entitylogicalname', 'SinkCreatedOn', 'SinkModifiedOn', 'messageid',
    'sourcesystem', 'Id', 'entitydata'
]
ignore_list = [col.lower() for col in ignore_list_1]

sqlContext.sql("use dsc60263_fsm_tz_db")
# Walk every table in the database, skipping duplicate/backup/temp tables,
# and print the distinct message ids of rows last updated yesterday.
for m in sqlContext.tables("dsc60263_fsm_tz_db").select('tableName').where(
        "tableName not like '%dup%'").where(
        "tableName not like '%bkp%'").where(
        "tableName not like '%temptz%'").collect():
    k = m.asDict().values()[0].encode('utf-8')  # Python 2: .values() is a list
    v1 = sqlContext.table(k).filter(
        "to_date(lastupdatedatetime) = date_sub(CAST(current_timestamp() as DATE), 1)"
    )
    y1 = v1.select("message__id").distinct()
    if y1.count() > 0:
        v2 = list(v1.select("message__id").toPandas()["message__id"])
        for msg_id in v2:
            print(str(msg_id).strip() + "\n")
sys.exit(0)
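# A minimal, self-contained sketch (hypothetical function name and sample
# data) of what the key-collecting fragment above computes once wrapped in a
# function: it walks a dict and keeps the keys whose values still hold a
# serialized JSON object.
def keys_with_nested_json(a):
    key_list = []
    for i in a.keys():
        b = a[i]
        if "{" in str(b):  # crude nested-object test, as in the fragment above
            key_list = key_list + [i]
    return key_list

sample = {"id": "42", "sourcesystem": "fsm", "entitydata": '{"pressure": 7}'}
print(keys_with_nested_json(sample))  # ['entitydata']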
retweeted = users_agg_new.join(
    user_retweets,
    users_agg_new["screen_name"] == user_retweets["screen_name"]) \
    .orderBy("retweeted", ascending=False)
retweeted = retweeted.withColumn(
    "ratio_tweet_retweeted",
    retweeted["retweeted"] / retweeted["tweets"]) \
    .orderBy("ratio_tweet_retweeted", ascending=False)
retweeted.limit(10).show()

# *************** VISUALIZATION WITH HIVE ************************
hiveContext = HiveContext(sc)

# In this case the tables are already stored in Hive. We can list them with:
hiveContext.tables().show()

# Load the data from the tweets table
tweets = hiveContext.table("tweets28a")
print("\nThe loaded data includes {} tweets\n".format(tweets.count()))
tweets.printSchema()

# Analyze the tweets that are geolocated
hiveContext.sql('DROP TABLE IF EXISTS tweets')
hiveContext.registerDataFrameAsTable(tweets, "tweets")
tweets_place = hiveContext.sql(
    "SELECT place.name, COUNT(text) AS tweets FROM tweets "
    "WHERE place IS NOT NULL GROUP BY place.name ORDER BY tweets DESC"
)
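# Assumed continuation (not in the original script): one way to actually
# visualize the tweets_place aggregate is to pull the top rows down to the
# driver with toPandas() and bar-chart them; matplotlib is an assumption here,
# not something the original code uses.
import matplotlib.pyplot as plt

top_places = tweets_place.limit(10).toPandas()
top_places.plot(kind="bar", x="name", y="tweets", legend=False)
plt.ylabel("tweets")
plt.title("Geolocated tweets per place")
plt.tight_layout()
plt.show()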
Raw_jsons = df.count()
df = df.sort("entitylogicalname")
df = df.withColumn("Process__id", lit(str(uuid.uuid1())))
df = df.withColumn("gdia_load_date", lit(str(today_full)))
df = df.withColumnRenamed('messageid', 'message__id')

# Empty "template" DataFrames (schema only; the where() filters out the seed
# row) that accumulate bad and duplicate records as the load progresses.
bad_records = sqlContext.createDataFrame(
    Row(**x) for x in [{"entitystring": "", "process__id": "",
                        "gdia_load_date": "", "reason": ""}]
).where("entitystring is null")
duplicate_records = sqlContext.createDataFrame(
    Row(**x) for x in [{"entitystring": "", "process__id": "",
                        "gdia_load_date": "", "reason": "", "sha_key": ""}]
).where("entitystring is null")

df, bad_records = update_df(
    from_df=df, dump_df=bad_records, deciding_col_rename="entitystring",
    reason="JSON parsing issues audit columns",
    list_col_bad_rec=["entitystring", "process__id", "gdia_load_date", "reason"],
    deciding_col="_corrupt_record")
df = df.withColumn("entitystring", to_json(struct(df.columns)))

tables_2_load = df.select("entitylogicalname").distinct().rdd \
    .map(lambda row: row["entitylogicalname"]) \
    .filter(lambda name: name is not None).collect()
sqlContext.sql("use " + db)
tables_in_db = sqlContext.tables(db).select("tableName").rdd \
    .map(lambda row: row["tableName"]).collect()

for table_name in tables_2_load:
    table = table_name.lower()
    final = extract_nested_json(
        source_dataframe=df, table=table, col_table_name="entitylogicalname",
        col_nested_data="entitydata", cols_to_add_hash="eventtype")
    final, bad_records = update_df(
        from_df=final, dump_df=bad_records, deciding_col_rename="temp_col",
        reason="JSON parsing issue entity data",
        list_col_bad_rec=["entitystring", "process__id", "gdia_load_date", "reason"],
        deciding_col="_corrupt_record_data")
    final = final.drop('entitydata')
    if final.count() > 0:
        for c in final.columns:
            final = final.withColumn(
                c, when(final[c] == 'None', lit(unicode(''))).otherwise(final[c]))
        # Flag duplicates within the batch: every row after the first one that
        # shares a sha_key gets dupeCount = "True"; the first keeps None.
        final = final.select(
            '*',
            row_number().over(
                Window.partitionBy('sha_key').orderBy(lit('sha_key'))
            ).alias('dupeCount')
        ).withColumn(
            'dupeCount',
            when(col('dupeCount') == 1, lit(None)).otherwise(lit("True")))
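# A self-contained toy reproduction (assumed SparkSession setup, fake data) of
# the duplicate-flagging pattern above, to make its effect concrete: only the
# second and later rows sharing a sha_key are marked.
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import row_number, when, col, lit

spark = SparkSession.builder.getOrCreate()
batch = spark.createDataFrame(
    [("k1", "a"), ("k1", "a"), ("k2", "b")], ["sha_key", "payload"])

flagged = batch.select(
    "*",
    row_number().over(
        Window.partitionBy("sha_key").orderBy(lit("sha_key"))
    ).alias("dupeCount")
).withColumn(
    "dupeCount", when(col("dupeCount") == 1, lit(None)).otherwise(lit("True")))

flagged.show()  # the second "k1" row is flagged with dupeCount = "True"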
# Likely the tail of extract_nested_json (its "def" line is not in this
# excerpt): build the row hash over every column except the row index.
    sha_columns = df3_2.columns
    sha_columns.remove(index_col)
    if type(cols_to_add_hash) == list:
        sha_columns.extend(cols_to_add_hash)
    else:
        # NOTE: the non-list branch appends col_nested_data, not
        # cols_to_add_hash, exactly as in the original code.
        sha_columns.append(col_nested_data)
    final = final.withColumn("sha_key", sha2(concat_ws("||", *sha_columns), 256))
    final = final.withColumn("sha_key2", concat_ws("||", *sha_columns))
    return (final)


sqlContext.clearCache()
sqlContext.sql("use " + db)
tables_in_db = sqlContext.tables(db).select("tableName").rdd \
    .map(lambda row: row["tableName"]).collect()

# Read the JSON dump: if a message is not parseable, the whole JSON lands in
# the _corrupt_record column; a null there means no errors for that message.
df_1 = sqlContext.read.json(db_location + db + "/raw/JSON_INPUT").cache()
#df_1 = sqlContext.read.json("JSON_INPUT_init").cache()
Raw_jsons = df_1.count()
if Raw_jsons == 0:
    print("\n\nNo data in the process ids")
    ids_2_load_df.write.mode("append").insertInto("latluri1.fsm_audit")
    exit()

df_1 = df_1.sort("entitylogicalname")
# Assign a global row index; partitioning the window on a constant pulls all
# rows into a single partition, so this does not scale to large inputs.
df = df_1.withColumn(
    "columnindex",
    row_number().over(Window.partitionBy(lit("A")).orderBy(lit("A")))
).cache()
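# A hedged, self-contained illustration (assumed SparkSession, toy records) of
# the _corrupt_record behaviour the comment above relies on: in Spark's
# default PERMISSIVE mode, unparseable lines land whole in _corrupt_record.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
lines = ['{"entitylogicalname": "asset", "messageid": "1"}',
         '{"entitylogicalname": broken']  # second record is invalid JSON

df = spark.read.json(spark.sparkContext.parallelize(lines)).cache()
df.filter("_corrupt_record is not null").show(truncate=False)  # the bad line
df.filter("_corrupt_record is null").show()                    # clean records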