    # Tail of a helper that collects the keys whose values contain "{" (i.e. nested JSON strings).
    for i in a.keys():
        b = a[i]
        if "{" in str(b):
            key_list.append(i)
    return key_list
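# Hedged usage sketch: the original def line is cut off above, so the wrapper name
# and the call below are illustrative assumptions, not the author's code.
def _keys_with_nested_json(a):
    key_list = []
    for i in a.keys():
        if "{" in str(a[i]):
            key_list.append(i)
    return key_list

# e.g. _keys_with_nested_json({"id": "1", "entitydata": '{"a": 1}'}) -> ['entitydata']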


ignore_list_1 = [
    'entitylogicalname', 'SinkCreatedOn', 'SinkModifiedOn', 'messageid',
    'sourcesystem', 'Id', 'entitydata'
]
ignore_list = [col.lower() for col in ignore_list_1]

sqlContext.sql("use dsc60263_fsm_tz_db")

for m in sqlContext.tables("dsc60263_fsm_tz_db").select('tableName').where(
        "tableName not like '%dup%'").where(
            "tableName not like '%bkp%'").where(
                "tableName not like '%temptz%'").collect():
    k = m.asDict().values()[0].encode('utf-8')
    v1 = sqlContext.table(k).filter(
        "to_date(lastupdatedatetime)  = date_sub(CAST(current_timestamp() as DATE), 1)"
    )
    y1 = v1.select("message__id").distinct()
    if y1.count() > 0:
        v2 = list(v1.select("message__id").toPandas()["message__id"])
        for msg_id in v2:
            print(str(msg_id).strip() + "\n")

sys.exit(0)
Example n. 2
retweeted = users_agg_new.join(user_retweets, users_agg_new["screen_name"] == user_retweets["screen_name"])\
                         .orderBy("retweeted", ascending=False)

retweeted = retweeted.withColumn("ratio_tweet_retweeted", retweeted["retweeted"]/retweeted["tweets"])\
                    .orderBy("ratio_tweet_retweeted", ascending=False)

retweeted.limit(10).show()

# *************** VISUALIZATION WITH HIVE ************************

hiveContext = HiveContext(sc)

# In this case, the tables are already stored in Hive. We can list them with the command:

hiveContext.tables().show()

# Load the data from the tweets table

tweets = hiveContext.table("tweets28a")
print("\nLos datos cargados incluyen {} tweets\n".format(tweets.count()))

tweets.printSchema()

# Analyze the tweets that are geolocated

hiveContext.sql('DROP TABLE IF EXISTS tweets')
hiveContext.registerDataFrameAsTable(tweets, "tweets")
tweets_place = hiveContext.sql(
    "SELECT place.name, COUNT(text) AS tweets FROM tweets WHERE place IS NOT NULL GROUP BY place.name ORDER BY tweets DESC"
)
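# Usage sketch (assumption, not shown in the original): inspect the per-place
# tweet counts computed above, mirroring how earlier results are displayed.
tweets_place.show(10)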
Example n. 3
Raw_jsons = df.count()
df = df.sort("entitylogicalname")
df = df.withColumn("Process__id", lit(str(uuid.uuid1())))
df = df.withColumn("gdia_load_date", lit(str(today_full)))
df = df.withColumnRenamed('messageid', 'message__id')

# Empty audit DataFrames (schema only) used to collect bad and duplicate records.
bad_records = sqlContext.createDataFrame(
    Row(**x) for x in [{"entitystring": "", "process__id": "", "gdia_load_date": "", "reason": ""}]
).where("entitystring is null")
duplicate_records = sqlContext.createDataFrame(
    Row(**x) for x in [{"entitystring": "", "process__id": "", "gdia_load_date": "", "reason": "", "sha_key": ""}]
).where("entitystring is null")

df, bad_records = update_df(from_df=df, dump_df=bad_records, deciding_col_rename="entitystring",
                            reason="JSON parsing issues audit columns",
                            list_col_bad_rec=["entitystring", "process__id", "gdia_load_date", "reason"],
                            deciding_col="_corrupt_record")
df = df.withColumn("entitystring", to_json(struct(df.columns)))
tables_2_load = df.select("entitylogicalname").distinct().rdd.map(
    lambda l494: l494["entitylogicalname"]).filter(lambda l509: l509 is not None).collect()


sqlContext.sql("use "+db)
tables_in_db=sqlContext.tables(db).select("tableName").rdd.map(lambda l321: l321["tableName"]).collect()
#tables_in_db

for i344_1 in tables_2_load:
    # i344_1 = "gcct_genericInformationtopic"
    # if i344_1 == "gcct_genericInformationtopic":
    i344 = i344_1.lower()
    # print(i344)
    final = extract_nested_json(source_dataframe=df, table=i344, col_table_name="entitylogicalname",
                                col_nested_data="entitydata", cols_to_add_hash="eventtype")
    final, bad_records = update_df(from_df=final, dump_df=bad_records, deciding_col_rename="temp_col",
                                   reason="JSON parsing issue entity data",
                                   list_col_bad_rec=["entitystring", "process__id", "gdia_load_date", "reason"],
                                   deciding_col="_corrupt_record_data")
    final = final.drop('entitydata')
    if final.count() > 0:
        for i1004 in final.columns:
            final = final.withColumn(i1004, when(final[i1004] == 'None', lit(unicode(''))).otherwise(final[i1004]))
        final = final.select('*', row_number().over(
            Window.partitionBy('sha_key').orderBy(lit('sha_key'))).alias('dupeCount')
        ).withColumn('dupeCount', when(col('dupeCount') == 1, lit(None)).otherwise(lit("True")))
        # Note: this flags within-batch duplicates every time; every row after the first for a given sha_key gets dupeCount = "True".
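        # Hedged follow-up (assumption, not in the original snippet): the dupeCount
        # flag set above can be used to split off within-batch duplicates and keep
        # only the first row per sha_key, e.g.:
        batch_dupes = final.where("dupeCount = 'True'")
        final = final.where("dupeCount is null").drop('dupeCount')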
Example n. 4
    sha_columns = df3_2.columns
    sha_columns.remove(index_col)
    if isinstance(cols_to_add_hash, list):
        sha_columns.extend(cols_to_add_hash)
    else:
        sha_columns.append(col_nested_data)
    # print(sha_columns)
    final = final.withColumn("sha_key", sha2(concat_ws("||", *sha_columns), 256))
    final = final.withColumn("sha_key2", concat_ws("||", *sha_columns))
    return final


sqlContext.clearCache()
sqlContext.sql("use " + db)
tables_in_db = sqlContext.tables(db).select("tableName").rdd.map(
    lambda l321: l321["tableName"]).collect()

# Read the JSONs; any record that cannot be parsed is captured whole in the
# _corrupt_record column, so a null there means the message parsed without errors.
df_1 = sqlContext.read.json(db_location + db + "/raw/JSON_INPUT").cache()
#df_1=sqlContext.read.json("JSON_INPUT_init").cache()
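# Hedged illustration (not part of the original job): when at least one input line
# fails to parse, Spark adds a _corrupt_record column, so bad messages can be
# inspected here before being routed out (as update_df does in the previous example).
if "_corrupt_record" in df_1.columns:
    df_1.where("_corrupt_record is not null").show(5, False)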
Raw_jsons = df_1.count()
if Raw_jsons == 0:
    print("\n\nNo data in the process ids")
    ids_2_load_df.write.mode("append").insertInto("latluri1.fsm_audit")
    exit()

df_1 = df_1.sort("entitylogicalname")
# Assign a sequential row index over the whole DataFrame by windowing on a constant key.
df = df_1.withColumn(
    "columnindex",
    row_number().over(Window().partitionBy(lit("A")).orderBy(lit('A')))).cache()