def getReadRDBMSDataFrame():
    from_format = "dd/MMM/yyyy:HH:mm:ss"
    to_format = "dd-MM-yyyy"

    # Reset the incremental checkpoint before reading the extract.
    storeIncrementalValue(0, "ignore")

    df = spark.read.format("csv") \
        .option("inferSchema", True) \
        .option("header", True) \
        .load(argv[17])

    # Drop rows whose creation date is missing or the literal string "null".
    df = df.filter(df.createdt.isNotNull() & (df.createdt != "null"))

    result = df.withColumn("createdt", f.from_unixtime(f.unix_timestamp(f.col("createdt"), from_format), to_format)) \
        .withColumn("value", f.col("value").cast("string")) \
        .withColumn("score", f.col("score").cast("string")) \
        .withColumn("regioncode", f.col("regioncode").cast("string")) \
        .withColumn("status", f.col("status").cast("string")) \
        .withColumn("count", f.col("count").cast("string")) \
        .withColumn("statuscode", f.col("statuscode").cast("string")) \
        .select("id", "username", "amount", "ip", "createdt", "value", "score", "regioncode",
                "status", "method", "key", "count", "type", "site", "statuscode") \
        .withColumn("rdbms_current_date", f.current_date()) \
        .filter(f.col("id") > getIncrementalValue())

    result = result.withColumn("id", f.col("id").cast("string")).na.fill("Not_Applicable")

    # Persist the highest id seen so the next run only picks up newer rows.
    max_value = df.agg({"id": "max"}).collect()[0]
    storeIncrementalValue(max_value["max(id)"], "overwrite")

    result.printSchema()
    return result
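# NOTE: getReadRDBMSDataFrame() relies on storeIncrementalValue/getIncrementalValue,
# which are not shown here. Below is a minimal sketch of what they might look like,
# assuming the watermark is kept in a small text file on the driver; the path and the
# file-based approach are assumptions, not the original implementation.
import os

_INCREMENTAL_PATH = "/tmp/rdbms_incremental_value.txt"  # hypothetical checkpoint location

def storeIncrementalValue(value, mode):
    """Persist the last processed id. With mode="ignore", keep an existing value."""
    if mode == "ignore" and os.path.exists(_INCREMENTAL_PATH):
        return
    with open(_INCREMENTAL_PATH, "w") as fh:
        fh.write(str(value))

def getIncrementalValue():
    """Return the last stored id, or 0 when nothing has been stored yet."""
    if not os.path.exists(_INCREMENTAL_PATH):
        return 0
    with open(_INCREMENTAL_PATH) as fh:
        return int(fh.read().strip() or 0)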
def token_score(df, on, value):
    """Token-set style fuzzy score between column `on` and the query string `value`."""
    q_val = value
    df = df.select([on])
    df = df.withColumn('query', F.lit(q_val).cast(T.StringType()))

    # Tokenise both sides on the separator pattern.
    pattern = ','
    df = df.withColumn('tokens1', F.split(F.col(on), pattern))
    df = df.withColumn('tokens2', F.split(F.col('query'), pattern))

    # intersection = tokens1 ∩ tokens2
    # diff1to2     = tokens1 \ tokens2 (tokens only on side 1)
    # diff2to1     = tokens2 \ tokens1 (tokens only on side 2)
    df = df.withColumn('intersection', F.array_intersect('tokens1', 'tokens2'))
    df = df.withColumn('diff1to2', F.array_except('tokens1', 'tokens2'))
    df = df.withColumn('diff2to1', F.array_except('tokens2', 'tokens1'))

    # Sort each token set and join it back into a single space-separated string.
    df = df.withColumn('sorted_sect', F.concat_ws(' ', F.sort_array('intersection')))
    df = df.withColumn('sorted_1to2', F.concat_ws(' ', F.sort_array('diff1to2')))
    df = df.withColumn('sorted_2to1', F.concat_ws(' ', F.sort_array('diff2to1')))

    # combined_1to2 = sorted_sect + " " + sorted_1to2  (sorted string 1)
    # combined_2to1 = sorted_sect + " " + sorted_2to1  (sorted string 2)
    df = df.withColumn('combined_1to2', F.concat_ws(' ', 'sorted_sect', 'sorted_1to2'))
    df = df.withColumn('combined_2to1', F.concat_ws(' ', 'sorted_sect', 'sorted_2to1'))

    # Strip leading/trailing whitespace.
    for c in ['sorted_sect', 'combined_1to2', 'combined_2to1']:
        df = df.withColumn(c, F.trim(F.col(c)))

    # Pairwise ratios between the three strings (spark_ratio is defined separately).
    df = df.withColumn('ratio1', spark_ratio(F.col('sorted_sect'), F.col('combined_1to2')))
    df = df.withColumn('ratio2', spark_ratio(F.col('sorted_sect'), F.col('combined_2to1')))
    df = df.withColumn('ratio3', spark_ratio(F.col('combined_2to1'), F.col('combined_1to2')))

    # Keep the best of the three ratios as the token-based fuzzy score.
    df = df.withColumn('max_ratio', F.greatest('ratio1', 'ratio2', 'ratio3'))
    df = df.withColumnRenamed('max_ratio', 'token_fuzzy')
    df = df.select(['token_fuzzy'])
    return df
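# NOTE: spark_ratio is still a TODO in the original code. One way it could be filled in
# (an assumption, not the author's implementation) is a UDF wrapping
# difflib.SequenceMatcher, returning a 0-100 similarity in the spirit of fuzzywuzzy's ratio.
from difflib import SequenceMatcher

from pyspark.sql import functions as F
from pyspark.sql import types as T

@F.udf(returnType=T.DoubleType())
def spark_ratio(s1, s2):
    """Similarity ratio between two strings, scaled to 0-100."""
    if s1 is None or s2 is None:
        return 0.0
    return 100.0 * SequenceMatcher(None, s1, s2).ratio()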
def fuzzyspark(df, on, value):
    """Levenshtein-based fuzzy score between column `on` and the query string `value`."""
    q_val = value
    df = df.select([on])
    df = df.withColumn('query', F.lit(q_val).cast(T.StringType()))
    # Normalise the edit distance by the shorter of the two string lengths.
    df = df.withColumn(
        'len', F.least(F.length(on), F.lit(len(q_val)).cast(T.IntegerType())))
    df = df.withColumn('levenshtein', F.levenshtein(on, 'query'))
    df = df.withColumn('score', F.col('levenshtein') / F.col('len'))
    df = df.select(['score'])
    return df
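# NOTE: a quick usage sketch for fuzzyspark; the SparkSession, DataFrame contents and
# column name below are made up for illustration only.
names_df = spark.createDataFrame(
    [("john smith",), ("jon smyth",), ("alice jones",)], ["name"])

# Lower score means a closer match; 0 is an exact match.
fuzzyspark(names_df, "name", "john smith").show()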
def get_prod_table_indexed(prod_table, prod_l_code, prod_l_desc, prod_l_plus_desc,
                           prod_l_cat):
    """
    Fetch the product table with the required category-level data plus a dense index.
    prod_l_cat is the name of the new concatenated column that is created
    (passed in as a parameter).
    """
    # A UDF runs row by row on the partitioned DataFrame, whereas the built-in column
    # functions operate on whole columns.
    # (* unpacks a list into positional args, ** unpacks a dict into kwargs.)
    char_replace = F.udf(replace_chars, T.StringType())

    if prod_l_plus_desc in prod_table.columns:
        prod_columns = [prod_l_code, prod_l_desc, prod_l_plus_desc]
        return prod_table.select(prod_columns).withColumn(
            prod_l_cat,
            F.concat(
                F.lit("_"),
                char_replace(prod_l_code, F.lit(r"[^A-Za-z0-9]+"), F.lit(r"")),
                F.lit("_"),
                char_replace(prod_l_desc, F.lit(r"[^A-Za-z]+"), F.lit(r"")),
                F.lit("_"),
                char_replace(prod_l_plus_desc, F.lit(r"[^A-Za-z]+"), F.lit(r"")))).withColumn(
                    "prod_index", F.dense_rank().over(Window.orderBy(prod_l_cat)))
    else:
        prod_columns = [prod_l_code, prod_l_desc]
        return prod_table.select(prod_columns).withColumn(
            prod_l_cat,
            F.concat(
                F.lit("_"),
                char_replace(prod_l_code, F.lit(r"[^A-Za-z0-9]+"), F.lit(r"")),
                F.lit("_"),
                char_replace(prod_l_desc, F.lit(r"[^A-Za-z]+"), F.lit(r"")))).withColumn(
                    "prod_index", F.dense_rank().over(Window.orderBy(prod_l_cat)))
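# NOTE: replace_chars (wrapped as char_replace above) is not shown. Given that it is
# called with a column value, a regex pattern, and a replacement string, a plausible
# sketch is below; this is an assumption about the helper, not its original code.
import re

def replace_chars(value, pattern, replacement):
    """Replace characters matching `pattern` in `value`; tolerate null inputs."""
    if value is None:
        return None
    return re.sub(pattern, replacement, str(value))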
data.cache()

# Number of loans falling into each TARGET class, with percentages.
data.groupBy("TARGET").count().withColumn(
    "Percentage", F.col("count") * 100 / data.count()).show()

# Number of missing values in each column, with percentages.
counts = [(x, data.filter(F.col(x).isNull()).count()) for x in data.columns]
counts.sort()
missing = [(name, x, round(x * 100.0 / data.count(), 2)) for name, x in counts if x > 0]
print(missing)

# Number of columns of each datatype.
print(Counter((x[1] for x in data.dtypes)))

# View unique values in all string columns.
str_col_names = [x.name for x in data.schema.fields if x.dataType == T.StringType()]
unique_df = data.agg(*((F.countDistinct(F.col(c))).alias(c) for c in str_col_names))
unique_df.show()

# Describe the DAYS_EMPLOYED column.
data.select('DAYS_EMPLOYED').describe().show()

# Describe the DAYS_BIRTH column (converted to age in years).
data = data.withColumn("AGE", F.col("DAYS_BIRTH") / -365)
data.select("DAYS_BIRTH", "AGE").describe().show()

# Dig deeper into the DAYS_EMPLOYED value 365243.
anom = data.filter(F.col('DAYS_EMPLOYED') == 365243)
non_anom = data.filter(F.col('DAYS_EMPLOYED') != 365243)
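# NOTE: a possible follow-up to the split above, sketched under the usual treatment of a
# sentinel value (the flag column name is made up, not part of the original script):
# size up the two groups, then flag the anomaly and null out the sentinel so it does not
# skew later statistics.
print(anom.count(), non_anom.count())

data = (data
        .withColumn("DAYS_EMPLOYED_ANOM", F.col("DAYS_EMPLOYED") == 365243)
        .withColumn("DAYS_EMPLOYED",
                    F.when(F.col("DAYS_EMPLOYED") == 365243, None)
                     .otherwise(F.col("DAYS_EMPLOYED"))))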
def main(args):
    # Create a directory to save outputs.
    if args.dst.exists():
        logger.info('Found an existing destination folder. Deleting...')
        shutil.rmtree(args.dst, ignore_errors=True)

    # Create the Spark session.
    if args.spark_driver_mem is not None:
        driver_mem = f'{args.spark_driver_mem}g'
    else:
        driver_mem = '{0}g'.format(int(psutil.virtual_memory().total // 1e9))
    spark = (SparkSession.builder.appName(__name__).config(
        'spark.driver.memory', driver_mem).getOrCreate())

    # Build the schema.
    schema = t.StructType([
        t.StructField('absolute_url', t.StringType()),
        t.StructField('author', t.StringType()),
        t.StructField('author_str', t.StringType()),
        t.StructField('cluster', t.StringType()),
        t.StructField('date_created', t.DateType()),
        t.StructField('date_modified', t.DateType()),
        t.StructField('download_url', t.StringType()),
        t.StructField('extracted_by_ocr', t.BooleanType()),
        t.StructField('html', t.StringType()),
        t.StructField('html_columbia', t.StringType()),
        t.StructField('html_lawbox', t.StringType()),
        t.StructField('html_with_citations', t.StringType()),
        t.StructField('id', t.LongType()),
        t.StructField('joined_by', t.ArrayType(t.StringType())),
        t.StructField('local_path', t.StringType()),
        t.StructField('opinions_cited', t.ArrayType(t.StringType())),
        t.StructField('page_count', t.IntegerType()),
        t.StructField('per_curiam', t.BooleanType()),
        t.StructField('plain_text', t.BooleanType()),
        t.StructField('resource_uri', t.BooleanType()),
        t.StructField('sha1', t.StringType()),
        t.StructField('type', t.StringType()),
    ])

    # Find all JSON files.
    json_df = spark.read.json(str(args.src), schema=schema, multiLine=True)
    logger.info(f'Processing {json_df.count()} JSON files...')

    # Create a UDF to parse HTML markup.
    global parse_html
    parse_html = partial(parse_html, concat_str=args.concat_str)
    parse_html_udf = f.udf(parse_html, t.StringType())

    # Concatenate the text columns.
    logger.info(f"Extracting text columns ({', '.join(args.text_fields)}) and "
                f"concatenating with '{args.concat_str}'...")
    CAT_COL = 'concat'
    # First concatenate the text columns within a row (i.e. one JSON file),
    # then concatenate all rows together.
    texts_df = (json_df.select(*args.text_fields).withColumn(
        CAT_COL, f.concat_ws(' ', *args.text_fields)).withColumn(
            CAT_COL, parse_html_udf(CAT_COL)).agg(
                f.collect_list(CAT_COL).alias(CAT_COL)).withColumn(
                    CAT_COL, f.concat_ws(' ', CAT_COL)))

    # Write the output file.
    logger.info(f'Writing text to {args.dst}')
    texts_df.write.text(str(args.dst))
    logger.info(f'Text successfully saved to {args.dst}')
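# NOTE: the parse_html helper that gets partially applied in main() is not shown. A
# minimal stand-in is sketched below, assuming it strips markup and joins the visible
# text with concat_str; both the BeautifulSoup choice and the exact signature are
# assumptions, not the original implementation.
from bs4 import BeautifulSoup

def parse_html(html, concat_str=' '):
    """Strip HTML tags and collapse the visible text into one string."""
    if not html:
        return ''
    soup = BeautifulSoup(html, 'html.parser')
    return concat_str.join(soup.stripped_strings)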
if __name__ == "__main__":
    conf = SparkConf().setAppName('Twitter Parse')
    sc = SparkContext.getOrCreate(conf=conf)
    assert sc.version >= '2.3'  # make sure we have Spark 2.3+
    sqlContext = SQLContext(sc)
    spark = SparkSession.builder.appName('Twitter Parse').getOrCreate()
    spark.sparkContext.setLogLevel('WARN')
    sc.setLogLevel('WARN')

    def remove_diacritics(s):
        return unidecode.unidecode(s)

    rem_udf = functions.udf(remove_diacritics, types.StringType())

    # er = EntityResolution(os.path.join(bdenv_loc, 'twitter_parse_2018_01.json'), "stopwords.txt")
    # er.preprocessDF()
    for i in range(1, 7):
        er = EntityResolution(os.path.join(bdenv_loc, 'twitter_parse_2018_0{}.json'.format(i)),
                              "stopwords.txt")
        er.preprocessDF()
"dendogram_index_list_nsovr1.xlsx" ] temp_time = time() # prod_l_code, prod_l_desc, prod_l_cat = get_prod_level(prod_cat_level) prod_l_code = "product_code" prod_l_desc = "ns_harm" prod_l_cat = "ns_harm" #variable that will have values concat of code and desc print prod_l_code print prod_l_desc print prod_l_cat user = get_user() print "user: "******"""select * from x5_ru_analysis.kg_SM_karusel_100per2_52weeks # where item_spend>0 and item_qty >0""") #trans = trans.withColumnRenamed("category_id","generaltype_id") # trans = trans.where(F.col("store_format")=="TRADITIONAL_TRADE") # Select distinct product_code, dunn_cat_english_1, group_code # from x5_ru_analysis.ak_prod_karusel_2 # prod_table = sqlContext.sql(""" # SELECT DISTINCT PRODUCT_CODE,NS_HARM,NS_HARM AS GROUP_CODE FROM (SELECT A.*,CONCAT('_',GROUP_CODE,'_',DUNN_CAT_ENGLISH) AS NEW_CAT # FROM x5_ru_analysis.ak_prod_karusel_2 A) A INNER JOIN # X5_RU_ANALYSIS.AK_NS_MAP_KARUSEL1 B ON A.NEW_CAT = B.KARU_CATEGORY_FINAL # """) # prod_table = prod_table.select([prod_l_code,prod_l_desc,"group_code"]).withColumn(
sc = SparkContext()
ss = SparkSession.builder.master("local").getOrCreate()

rows = sc.textFile("/logs_nasa/NASA_access_log_Jul95").map(parse_interaction)
null_row = lambda row: row is not None
rows = rows.filter(null_row)
df = ss.createDataFrame(rows)

# task1: requests with 5xx return codes, counted per (host, request)
df_errors = df[df.return_code.like("5%")]
df_errors_length = df_errors.groupBy(["host", "request"]).count()
df_errors_length.coalesce(1).write.csv("task2.csv", mode="append")

# task2: counts per (date, request_method, return_code) combination
df_requests_fields_combs = df.groupBy(["date", "request_method", "return_code"]).count()
df_requests_fields_combs.coalesce(1).write.csv("task3.csv")

# task3: daily 4xx/5xx counts with a 7-day rolling average
days = lambda i: i * 86400
df_codes = df[(df.return_code.like("5%") | df.return_code.like("4%"))]
df_codes_grouped = df_codes.groupBy(["return_code", "date"]).count()
udf_myFunction = functions.udf(convert_date, types.StringType())
df_codes_grouped = df_codes_grouped.withColumn("date", udf_myFunction("date"))
df_codes_grouped = df_codes_grouped.withColumn("date", df_codes_grouped.date.cast("timestamp"))
w = (Window.orderBy(functions.col("date").cast("long")).rangeBetween(-days(7), 0))
df_codes_grouped = df_codes_grouped.withColumn("rolling_average", functions.avg("count").over(w))
df_codes_grouped.coalesce(1).write.csv("task4.csv")
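# NOTE: parse_interaction is referenced above but not shown. A rough sketch of what it
# could look like is below, assuming the NASA access log's Common Log Format and the
# field names used above (host, date, request_method, request, return_code); this is an
# illustration, not the original parser.
import re
from pyspark.sql import Row

# Common Log Format: host - - [date] "METHOD request PROTO" code bytes
LOG_PATTERN = re.compile(r'^(\S+) \S+ \S+ \[([^\]]+)\] "(\S+) (\S+)[^"]*" (\d{3}) \S+')

def parse_interaction(line):
    """Parse one access-log line into a Row, or None if it does not match."""
    match = LOG_PATTERN.match(line)
    if match is None:
        return None
    host, date, method, request, code = match.groups()
    return Row(host=host, date=date.split(':')[0],
               request_method=method, request=request, return_code=code)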
spark = (SparkSession.builder.appName("US departure flight data").config( "spark.driver.memory", "8g").getOrCreate()) # %% def to_date_format_udf(d_str: str): l = [char for char in d_str] return "".join(l[0:2]) + "/" + "".join(l[2:4]) + " " + " " + "".join( l[4:6]) + ":" + "".join(l[6:]) to_date_format_udf("02190925") # %% spark.udf.register("to_date_format_udf", to_date_format_udf, T.StringType()) # %% df = (spark.read.format("csv").schema( "date STRING, delay INT, distance INT, origin STRING, destination STRING"). option("header", "true").option( "path", os.path.join(DATA_DIRECTORY, "flights", "departuredelays.csv")).load()) # %% df.show(5, False) # %% df.selectExpr("to_date_format_udf(date) as data_format").show(10, truncate=False)
def add_zip5_col(dataframe, column_name="zipcode"):
    """Add a 'zip5' column derived from `column_name` via the get_zip5 helper."""
    get_zip5_udf = F.udf(get_zip5, T.StringType())
    return dataframe.withColumn("zip5", get_zip5_udf(dataframe[column_name]))
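# NOTE: get_zip5 is assumed to normalise a raw zipcode (e.g. ZIP+4 or un-padded values)
# down to its five-digit prefix; the sketch below is a plausible version, not the
# original helper.
def get_zip5(zipcode):
    """Return the first five digits of a zipcode, or None if it cannot be parsed."""
    if zipcode is None:
        return None
    digits = ''.join(ch for ch in str(zipcode) if ch.isdigit())
    return digits[:5].zfill(5) if digits else None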