def generateExpr(columnName, listIntervals):
    if len(listIntervals) == 1:
        return when(col(columnName).between(listIntervals[0][0], listIntervals[0][1]), 0).otherwise(None)
    else:
        return (when((col(columnName) >= listIntervals[0][0]) & (col(columnName) < listIntervals[0][1]),
                     len(listIntervals) - 1)
                .otherwise(generateExpr(columnName, listIntervals[1:])))
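# A minimal usage sketch for generateExpr (not part of the original source): it assumes an
# active SparkSession named `spark` and an illustrative list of half-open intervals. Each row
# is labelled with len(intervals) - 1 - position, matching the recursion above.
from pyspark.sql.functions import col, when

intervals = [(0, 10), (10, 20), (20, 30)]   # hypothetical bin edges
df = spark.range(25).toDF("value")
df.withColumn("bin", generateExpr("value", intervals)).show()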
def data(self):
    from pyspark.sql.functions import array, explode, col, lit
    return self.spark.range(10).toDF('id') \
        .withColumn("vs", array([lit(i * 1.0) + col('id') for i in range(20, 30)])) \
        .withColumn("v", explode(col('vs'))) \
        .drop('vs') \
        .withColumn('w', lit(1.0))
def test_mixed_sql_and_udf(self):
    # max/min/rank here must be the SQL functions, not the Python builtins
    from pyspark.sql.functions import col, max, min, rank

    df = self.data
    w = self.unbounded_window
    ow = self.ordered_window
    max_udf = self.pandas_agg_max_udf
    min_udf = self.pandas_agg_min_udf

    result1 = df.withColumn('v_diff', max_udf(df['v']).over(w) - min_udf(df['v']).over(w))
    expected1 = df.withColumn('v_diff', max(df['v']).over(w) - min(df['v']).over(w))

    # Test mixing sql window function and window udf in the same expression
    result2 = df.withColumn('v_diff', max_udf(df['v']).over(w) - min(df['v']).over(w))
    expected2 = expected1

    # Test chaining sql aggregate function and udf
    result3 = df.withColumn('max_v', max_udf(df['v']).over(w)) \
        .withColumn('min_v', min(df['v']).over(w)) \
        .withColumn('v_diff', col('max_v') - col('min_v')) \
        .drop('max_v', 'min_v')
    expected3 = expected1

    # Test mixing sql window function and udf
    result4 = df.withColumn('max_v', max_udf(df['v']).over(w)) \
        .withColumn('rank', rank().over(ow))
    expected4 = df.withColumn('max_v', max(df['v']).over(w)) \
        .withColumn('rank', rank().over(ow))

    self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
    self.assertPandasEqual(expected2.toPandas(), result2.toPandas())
    self.assertPandasEqual(expected3.toPandas(), result3.toPandas())
    self.assertPandasEqual(expected4.toPandas(), result4.toPandas())
def test_vectorized_udf_string_in_udf(self):
    import pandas as pd

    df = self.spark.range(10)
    str_f = pandas_udf(lambda x: pd.Series(map(str, x)), StringType())
    actual = df.select(str_f(col('id')))
    expected = df.select(col('id').cast('string'))
    self.assertEquals(expected.collect(), actual.collect())
def setup_method(self, method):
    sparkConf = create_spark_conf().setMaster("local[4]") \
        .setAppName("test wide and deep")
    self.sc = init_nncontext(sparkConf)
    self.sqlContext = SQLContext(self.sc)
    data_path = os.path.join(os.path.split(__file__)[0], "../../resources/recommender")

    categorical_gender_udf = udf(
        lambda gender: categorical_from_vocab_list(gender, ["F", "M"], start=1))
    bucket_udf = udf(
        lambda feature1, feature2: hash_bucket(str(feature1) + "_" + str(feature2), bucket_size=100))

    self.data_in = self.sqlContext.read.parquet(data_path) \
        .withColumn("gender", categorical_gender_udf(col("gender")).cast("int")) \
        .withColumn("occupation-gender",
                    bucket_udf(col("occupation"), col("gender")).cast("int"))

    self.column_info = ColumnFeatureInfo(
        wide_base_cols=["occupation", "gender"],
        wide_base_dims=[21, 3],
        wide_cross_cols=["occupation-gender"],
        wide_cross_dims=[100],
        indicator_cols=["occupation", "gender"],
        indicator_dims=[21, 3],
        embed_cols=["userId", "itemId"],
        embed_in_dims=[100, 100],
        embed_out_dims=[20, 20],
        continuous_cols=["age"])
def test_basic(self):
    df = self.data
    weighted_mean_udf = self.pandas_agg_weighted_mean_udf

    # Groupby one column and aggregate one UDF with literal
    result1 = df.groupby('id').agg(weighted_mean_udf(df.v, lit(1.0))).sort('id')
    expected1 = df.groupby('id').agg(mean(df.v).alias('weighted_mean(v, 1.0)')).sort('id')
    self.assertPandasEqual(expected1.toPandas(), result1.toPandas())

    # Groupby one expression and aggregate one UDF with literal
    result2 = df.groupby((col('id') + 1)).agg(weighted_mean_udf(df.v, lit(1.0))) \
        .sort(df.id + 1)
    expected2 = df.groupby((col('id') + 1)) \
        .agg(mean(df.v).alias('weighted_mean(v, 1.0)')).sort(df.id + 1)
    self.assertPandasEqual(expected2.toPandas(), result2.toPandas())

    # Groupby one column and aggregate one UDF without literal
    result3 = df.groupby('id').agg(weighted_mean_udf(df.v, df.w)).sort('id')
    expected3 = df.groupby('id').agg(mean(df.v).alias('weighted_mean(v, w)')).sort('id')
    self.assertPandasEqual(expected3.toPandas(), result3.toPandas())

    # Groupby one expression and aggregate one UDF without literal
    result4 = df.groupby((col('id') + 1).alias('id')) \
        .agg(weighted_mean_udf(df.v, df.w)) \
        .sort('id')
    expected4 = df.groupby((col('id') + 1).alias('id')) \
        .agg(mean(df.v).alias('weighted_mean(v, w)')) \
        .sort('id')
    self.assertPandasEqual(expected4.toPandas(), result4.toPandas())
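# The fixtures above (self.data, self.pandas_agg_weighted_mean_udf) come from the surrounding
# test class and are not shown here. A minimal sketch of how such a grouped-aggregate pandas
# UDF could be declared in the Spark 2.x API these tests use, assuming numpy is available:
import numpy as np
from pyspark.sql.functions import pandas_udf, PandasUDFType

@pandas_udf('double', PandasUDFType.GROUPED_AGG)
def weighted_mean_udf(v, w):
    # v and w arrive as pandas Series holding one group's values and weights
    return np.average(v, weights=w)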
def test_column_getitem(self):
    from pyspark.sql.functions import col

    self.assertIsInstance(col("foo")[1:3], Column)
    self.assertIsInstance(col("foo")[0], Column)
    self.assertIsInstance(col("foo")["bar"], Column)
    self.assertRaises(ValueError, lambda: col("foo")[0:10:2])
def scalar_pandas_udf_example(spark):
    # $example on:scalar_pandas_udf$
    import pandas as pd

    from pyspark.sql.functions import col, pandas_udf
    from pyspark.sql.types import LongType

    # Declare the function and create the UDF
    def multiply_func(a, b):
        return a * b

    multiply = pandas_udf(multiply_func, returnType=LongType())

    # The function for a pandas_udf should be able to execute with local Pandas data
    x = pd.Series([1, 2, 3])
    print(multiply_func(x, x))
    # 0    1
    # 1    4
    # 2    9
    # dtype: int64

    # Create a Spark DataFrame, 'spark' is an existing SparkSession
    df = spark.createDataFrame(pd.DataFrame(x, columns=["x"]))

    # Execute function as a Spark vectorized UDF
    df.select(multiply(col("x"), col("x"))).show()
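# The same scalar UDF can also be declared with the decorator form of pandas_udf.
# A brief sketch (not from the original example), assuming the same `spark` session:
from pyspark.sql.functions import col, pandas_udf
from pyspark.sql.types import LongType

@pandas_udf(LongType())
def squared(s):
    # s is a pandas Series covering a whole batch of rows, not a single value
    return s * s

spark.range(5).select(squared(col("id"))).show()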
def test_smvPlusDateTime(self):
    df = self.createDF("t:Timestamp[yyyyMMdd]", "19760131;20120229")
    r1 = df.select(col("t").smvPlusDays(-10).alias("ts"))
    r2 = df.select(col("t").smvPlusMonths(1).alias("ts"))
    r3 = df.select(col("t").smvPlusWeeks(3).alias("ts"))
    r4 = df.select(col("t").smvPlusYears(2).alias("ts"))
    r5 = df.select(col("t").smvPlusYears(4).alias("ts"))

    s = "ts: Timestamp[yyyy-MM-dd hh:mm:ss.S]"
    e1 = self.createDF(s, "1976-01-21 00:00:00.0;" + "2012-02-19 00:00:00.0")
    e2 = self.createDF(s, "1976-02-29 00:00:00.0;" + "2012-03-29 00:00:00.0")
    e3 = self.createDF(s, "1976-02-21 00:00:00.0;" + "2012-03-21 00:00:00.0")
    e4 = self.createDF(s, "1978-01-31 00:00:00.0;" + "2014-02-28 00:00:00.0")
    e5 = self.createDF(s, "1980-01-31 00:00:00.0;" + "2016-02-29 00:00:00.0")

    self.should_be_same(e1, r1)
    self.should_be_same(e2, r2)
    self.should_be_same(e3, r3)
    self.should_be_same(e4, r4)
    self.should_be_same(e5, r5)
def test_vectorized_udf_dates(self):
    import pandas as pd

    schema = StructType().add("idx", LongType()).add("date", DateType())
    data = [(0, date(1969, 1, 1),),
            (1, date(2012, 2, 2),),
            (2, None,),
            (3, date(2100, 4, 4),),
            (4, date(2262, 4, 12),)]
    df = self.spark.createDataFrame(data, schema=schema)

    date_copy = pandas_udf(lambda t: t, returnType=DateType())
    df = df.withColumn("date_copy", date_copy(col("date")))

    @pandas_udf(returnType=StringType())
    def check_data(idx, date, date_copy):
        msgs = []
        is_equal = date.isnull()
        for i in range(len(idx)):
            if (is_equal[i] and data[idx[i]][1] is None) or \
                    date[i] == data[idx[i]][1]:
                msgs.append(None)
            else:
                msgs.append(
                    "date values are not equal (date='%s': data[%d][1]='%s')"
                    % (date[i], idx[i], data[idx[i]][1]))
        return pd.Series(msgs)

    result = df.withColumn("check_data",
                           check_data(col("idx"), col("date"), col("date_copy"))).collect()

    self.assertEquals(len(data), len(result))
    for i in range(len(result)):
        self.assertEquals(data[i][1], result[i][1])  # "date" col
        self.assertEquals(data[i][1], result[i][2])  # "date_copy" col
        self.assertIsNone(result[i][3])  # "check_data" col
def test_udf_with_filter_function(self):
    df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
    from pyspark.sql.functions import udf, col
    from pyspark.sql.types import BooleanType

    my_filter = udf(lambda a: a < 2, BooleanType())
    sel = df.select(col("key"), col("value")).filter((my_filter(col("key"))) & (df.value < "2"))
    self.assertEqual(sel.collect(), [Row(key=1, value='1')])
def get_latest_data(self):
    from pyspark.sql import SparkSession
    import config
    import pandas as pd

    # initialise sparkContext
    spark1 = SparkSession.builder \
        .master(config.sp_master) \
        .appName(config.sp_appname) \
        .config('spark.executor.memory', config.sp_memory) \
        .config("spark.cores.max", config.sp_cores) \
        .getOrCreate()
    sc = spark1.sparkContext

    # using SQLContext to read parquet file
    from pyspark.sql import SQLContext
    sqlContext = SQLContext(sc)

    from datetime import datetime
    t1 = datetime.now()

    df = sqlContext.read.parquet(config.proj_path + '/datas/appid_datapoint_parquet1')
    df2 = sqlContext.read.parquet(config.proj_path + '/datas/appid_attribute_parquet')
    df2 = df2[['attribute_id', 'source', 'target_address', 'location']]

    # renaming the column
    from pyspark.sql.functions import col
    df2 = df2.select(col("attribute_id").alias("target_attribute_id"),
                     col("source").alias("source_y"),
                     col("target_address").alias("target_address_y"),
                     col("location").alias("location"),
                     )

    # merging the dfs
    df_merge = df.join(df2, how='left', on='target_attribute_id')

    # Needed data extraction
    t1 = datetime.now()
    data = df_merge.registerTempTable('dummy')
    data = sqlContext.sql('select sum(byte_count) as byte_count_sum , time_stamp, location from dummy group by location, time_stamp')
    data = data[data.byte_count_sum > 0]

    # data cleaning
    self.p7_df = data.toPandas()
    t2 = datetime.now()
    time_to_fetch = str(t2 - t1)

    self.p7_df['bw'] = self.p7_df['byte_count_sum'] / (8 * 3600)
    self.p7_df = self.p7_df.sort_values(by='location', ascending=True)
    dates_outlook = pd.to_datetime(pd.Series(self.p7_df.time_stamp), unit='ms')
    self.p7_df.index = dates_outlook
    self.p7_df['date'] = self.p7_df.index.date
    self.p7_df = self.p7_df.sort_values(by='time_stamp')

    t2 = datetime.now()
    time_to_fetch = str(t2 - t1)
def test_cast_to_string_with_udt(self):
    from pyspark.sql.functions import col

    row = (ExamplePoint(1.0, 2.0), PythonOnlyPoint(3.0, 4.0))
    schema = StructType([StructField("point", ExamplePointUDT(), False),
                         StructField("pypoint", PythonOnlyUDT(), False)])
    df = self.spark.createDataFrame([row], schema)

    result = df.select(col('point').cast('string'), col('pypoint').cast('string')).head()
    self.assertEqual(result, Row(point=u'(1.0, 2.0)', pypoint=u'[3.0, 4.0]'))
def test_smvRenameField_preserve_meta_for_unrenamed_fields(self):
    df = self.createDF("a:Integer; b:String", "1,abc;1,def;2,ghij")
    desc = "c description"
    res1 = df.groupBy(col("a")).agg(count(col("a")).alias("c")) \
        .smvDesc(("c", desc))
    self.assertEqual(res1.smvGetDesc(), [("a", ""), ("c", desc)])

    res2 = res1.smvRenameField(("a", "d"))
    self.assertEqual(res2.smvGetDesc(), [("d", ""), ("c", desc)])
def test_smvDayMonth70(self):
    df = self.createDF("t:Timestamp[yyyyMMdd]", "19760131;20120229")
    r1 = df.select(col("t").smvDay70().alias("t_day70"))
    r2 = df.select(col("t").smvMonth70().alias("t_month70"))
    e1 = self.createDF("t_day70: Integer", "2221;15399")
    e2 = self.createDF("t_month70: Integer", "72;505")
    self.should_be_same(e1, r1)
    self.should_be_same(e2, r2)
def create_hist_data(df, column, minim, maxim, bins=10):

    def create_all_conditions(current_col, column, left_edges, count=1):
        """
        Recursive function that exploits the
        ability to call the Spark SQL Column method
        .when() in a recursive way.
        """
        left_edges = left_edges[:]
        if len(left_edges) == 0:
            return current_col
        if len(left_edges) == 1:
            next_col = current_col.when(col(column) >= float(left_edges[0]), count)
            left_edges.pop(0)
            return create_all_conditions(next_col, column, left_edges[:], count + 1)
        next_col = current_col.when((float(left_edges[0]) <= col(column))
                                    & (col(column) < float(left_edges[1])), count)
        left_edges.pop(0)
        return create_all_conditions(next_col, column, left_edges[:], count + 1)

    num_range = maxim - minim
    bin_width = num_range / float(bins)
    left_edges = [minim]
    for _bin in range(bins):
        left_edges = left_edges + [left_edges[-1] + bin_width]
    left_edges.pop()
    expression_col = when((float(left_edges[0]) <= col(column))
                          & (col(column) < float(left_edges[1])), 0)
    left_edges_copy = left_edges[:]
    left_edges_copy.pop(0)
    bin_data = (df.select(col(column))
                .na.drop()
                .select(col(column),
                        create_all_conditions(expression_col,
                                              column,
                                              left_edges_copy
                                              ).alias("bin_id")
                        )
                .groupBy("bin_id").count()
                ).toPandas()

    # If no data goes into one bin, it won't
    # appear in bin_data; so we should fill
    # in the blanks:
    bin_data.index = bin_data["bin_id"]
    new_index = list(range(bins))
    bin_data = bin_data.reindex(new_index)
    bin_data["bin_id"] = bin_data.index
    bin_data = bin_data.fillna(0)

    # We add the left edges and bin width:
    bin_data["left_edge"] = left_edges
    bin_data["width"] = bin_width

    return bin_data
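# A small usage sketch for create_hist_data (illustrative, not from the original source):
# assumes an active SparkSession named `spark` and that col/when are imported from
# pyspark.sql.functions, as the function body requires.
from pyspark.sql.functions import rand

df_demo = spark.range(1000).withColumn("value", rand(seed=42) * 100)   # hypothetical data
hist = create_hist_data(df_demo, "value", minim=0, maxim=100, bins=10)
print(hist[["bin_id", "left_edge", "width", "count"]])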
def spark_timestamp_split(
    data,
    ratio=0.75,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    """Spark timestamp based splitter
    The splitter splits the data into sets by timestamps without stratification on either
    user or item. The ratios are applied on the timestamp column which is divided
    accordingly into several partitions.

    Args:
        data (spark.DataFrame): Spark DataFrame to be split.
        ratio (float or list): Ratio for splitting data. If it is a single float number
            it splits data into two sets and the ratio argument indicates the ratio of
            training data set; if it is a list of float numbers, the splitter splits
            data into several portions corresponding to the split ratios. If a list is
            provided and the ratios are not summed to 1, they will be normalized.
            Earlier indexed splits will have earlier times
            (e.g the latest time in split[0] <= the earliest time in split[1])
        col_user (str): column name of user IDs.
        col_item (str): column name of item IDs.
        col_timestamp (str): column name of timestamps. Float number represented in
            seconds since Epoch.

    Returns:
        list: Splits of the input data as spark.DataFrame.
    """
    multi_split, ratio = process_split_ratio(ratio)
    ratio = ratio if multi_split else [ratio, 1 - ratio]
    ratio_index = np.cumsum(ratio)

    window_spec = Window.orderBy(col(col_timestamp))
    rating = data.withColumn("rank", row_number().over(window_spec))

    data_count = rating.count()
    rating_rank = rating.withColumn("rank", row_number().over(window_spec) / data_count)

    splits = []
    for i, _ in enumerate(ratio_index):
        if i == 0:
            rating_split = rating_rank.filter(col("rank") <= ratio_index[i]).drop("rank")
        else:
            rating_split = rating_rank.filter(
                (col("rank") <= ratio_index[i]) & (col("rank") > ratio_index[i - 1])
            ).drop("rank")
        splits.append(rating_split)

    return splits
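# A usage sketch (not from the original module): it assumes process_split_ratio, the
# DEFAULT_*_COL constants, numpy as np, col, row_number and Window are available from the
# surrounding module, as the function requires. The sample data and column names are illustrative.
ratings = spark.createDataFrame(
    [(1, 10, 4.0, 1546300800), (1, 11, 3.5, 1546387200),
     (2, 10, 5.0, 1546473600), (2, 12, 2.0, 1546560000)],
    ["userID", "itemID", "rating", "timestamp"],
)

# 75% earliest interactions for training, the remainder for testing
train, test = spark_timestamp_split(ratings, ratio=0.75, col_timestamp="timestamp")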
def test_string_functions(self):
    from pyspark.sql.functions import col, lit
    df = self.spark.createDataFrame([['nick']], schema=['name'])
    self.assertRaisesRegexp(
        TypeError,
        "must be the same type",
        lambda: df.select(col('name').substr(0, lit(1))))
    if sys.version_info.major == 2:
        self.assertRaises(
            TypeError,
            lambda: df.select(col('name').substr(long(0), long(1))))
def test_self_join_with_pandas(self):
    @pandas_udf('key long, col string', PandasUDFType.GROUPED_MAP)
    def dummy_pandas_udf(df):
        return df[['key', 'col']]

    df = self.spark.createDataFrame([Row(key=1, col='A'), Row(key=1, col='B'),
                                     Row(key=2, col='C')])
    df_with_pandas = df.groupBy('key').apply(dummy_pandas_udf)

    # this was throwing an AnalysisException before SPARK-24208
    res = df_with_pandas.alias('temp0').join(df_with_pandas.alias('temp1'),
                                             col('temp0.key') == col('temp1.key'))
    self.assertEquals(res.count(), 5)
def test_smvTimestampToStr(self):
    df = self.createDF("ts:Timestamp[yyyyMMdd'T'HHmmssZ];tz:String",
                       "20180428T025800+1000,+0000;,America/Los_Angeles;20180428T025800+1000,Australia/Sydney")
    # Use `Z` (RFC 822 time zone) in the SimpleDateFormat because it has only a single valid
    # way to represent a given offset. Avoid `z` (General Time Zone) because it may produce
    # different results on different platforms (e.g. UTC and +00:00).
    # Details in https://docs.oracle.com/javase/8/docs/api/java/text/SimpleDateFormat.html
    r1 = df.select(col("ts").smvTimestampToStr("+10:00", "yyyyMMdd:HHmmssZ").alias("localDT"))
    r2 = df.select(col("ts").smvTimestampToStr(col("tz"), "yyyy-MM-dd HH:mm:ssZ").alias("localDT2"))

    e1 = self.createDF("localDT: String", "20180428:025800+1000;;20180428:025800+1000")
    e2 = self.createDF("localDT2: String", "2018-04-27 16:58:00+0000;;2018-04-28 02:58:00+1000")

    self.should_be_same(e1, r1)
    self.should_be_same(e2, r2)
def create_tag_frequencies(self, dataframe):
    """Produces a PySpark dataframe containing a column representing the total frequency of the tags by record.
    The frequency of tags is determined by their proportion of the total number of tags in the dataframe.

    :param dataframe: the PySpark dataframe
    :returns: the PySpark dataframe containing the tag frequency field and all fields in the supplied dataframe
    """
    df_tags = dataframe.selectExpr("tag1 AS tag").union(dataframe.selectExpr("tag2 AS tag")) \
        .union(dataframe.selectExpr("tag3 AS tag")) \
        .union(dataframe.selectExpr("tag4 AS tag")).union(dataframe.selectExpr("tag5 AS tag"))
    df_tags = df_tags.na.drop(subset=["tag"])
    tags_total_count = df_tags.count()
    print("Total number of tags used, including duplicates:", tags_total_count)

    df_tag_freq = df_tags.groupBy("tag").count().orderBy(desc("count"))
    df_tag_freq = df_tag_freq.withColumn("frequency", col("count") / tags_total_count)
    df_tag_freq.orderBy(desc("frequency")).show(10)

    def one_hot_encode_top_n_tags(dataframe, n):
        """Produces a PySpark dataframe containing columns indicating whether each of the top n tags are present.

        :param dataframe: the PySpark dataframe
        :param n: the number of the top ranked tags to return as tag fields
        :returns: the PySpark dataframe containing the top n tag fields and all fields in the supplied dataframe
        """
        top_n = [t.tag for t in df_tag_freq.orderBy(desc("frequency")).select("tag").limit(n).collect()]
        for tag in top_n:
            # replace tag name ".net" with "dotnet", for example, to avoid problems with periods in tag names
            tag_column_name = ("tag_" + tag).replace(".", "dot")
            dataframe = dataframe.withColumn(tag_column_name,
                                             array_contains(dataframe.tags_split, tag).cast("int"))
        return dataframe

    dataframe = one_hot_encode_top_n_tags(dataframe, 20)

    tag_columns = [col for col in dataframe.columns if col.startswith('tag')]
    print("Tag-related columns")
    dataframe.select(tag_columns).show(10, False)

    dataframe.createOrReplaceTempView('df')
    df_tag_freq.createOrReplaceTempView('df_tag_freq')
    for n in range(1, 6):
        dataframe = self.sqlContext.sql(
            "SELECT df.*, df_tag_freq.frequency AS frequency_tag{} FROM df "
            "LEFT JOIN df_tag_freq ON df.tag{} = df_tag_freq.tag".format(n, n))
        dataframe = dataframe.na.fill({"frequency_tag{}".format(n): 0})
        dataframe.createOrReplaceTempView('df')

    dataframe = dataframe.withColumn(
        "frequency_sum",
        col("frequency_tag1") + col("frequency_tag2") + col("frequency_tag3")
        + col("frequency_tag4") + col("frequency_tag5"))

    # Remove temporary columns
    dataframe = dataframe.select([c for c in dataframe.columns
                                  if c not in {"tags_split", "tag1", "tag2", "tag3", "tag4", "tag5",
                                               "frequency_tag1", "frequency_tag2", "frequency_tag3",
                                               "frequency_tag4", "frequency_tag5"}])
    return dataframe
def parse_dates(df, format):
    """
    Parses the transaction date into year, month and day columns
    :param df: input df
    :param format: the format of the timestamp
    :return: dataframe
    """
    return df.withColumn('parsed_date', f.to_timestamp(f.col('transaction_date'), format)) \
        .withColumn("year", f.year(f.col('parsed_date'))) \
        .withColumn("month", f.month(f.col('parsed_date'))) \
        .withColumn("day", f.dayofmonth(f.col('parsed_date'))) \
        .withColumn("unix_ts", f.unix_timestamp('parsed_date')) \
        .drop("transaction_date")
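# A short usage sketch (illustrative only), assuming pyspark.sql.functions is imported as `f`
# as the function body requires, and a toy DataFrame with a transaction_date string column.
df_demo = spark.createDataFrame([("2021-03-15", 12.5)], ["transaction_date", "amount"])
parse_dates(df_demo, "yyyy-MM-dd").show()
# expect parsed_date, year, month, day and unix_ts columns alongside amount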
def test_register_vectorized_udf_basic(self):
    df = self.spark.range(10).select(
        col('id').cast('int').alias('a'),
        col('id').cast('int').alias('b'))
    original_add = pandas_udf(lambda x, y: x + y, IntegerType())
    self.assertEqual(original_add.deterministic, True)
    self.assertEqual(original_add.evalType, PythonEvalType.SQL_SCALAR_PANDAS_UDF)

    new_add = self.spark.catalog.registerFunction("add1", original_add)
    res1 = df.select(new_add(col('a'), col('b')))
    res2 = self.spark.sql(
        "SELECT add1(t.a, t.b) FROM (SELECT id as a, id as b FROM range(10)) t")
    expected = df.select(expr('a + b'))
    self.assertEquals(expected.collect(), res1.collect())
    self.assertEquals(expected.collect(), res2.collect())
def test_smvDedupByKeyWithOrder_with_column(self):
    schema = "a:Integer; b:Double; c:String"
    df = self.createDF(
        schema,
        """1,2.0,hello;
           1,3.0,hello;
           2,10.0,hello2;
           2,11.0,hello3"""
    )
    r1 = df.smvDedupByKeyWithOrder(col("a"))(col("b").desc())
    expect = self.createDF(
        schema,
        """1,3.0,hello;
           2,11.0,hello3"""
    )
    self.should_be_same(expect, r1)
def splitStrCol(self, column, featureNames, mark):
    """This function splits a column into different ones.
    In the case of this method, the column provided should be a string of the following form
    'word,foo'.

    :param column       Name of the target column, this column is going to be replaced.
    :param featureNames List of strings of the new column names after splitting the strings.
    :param mark         String that specifies the splitting mark of the string, this frequently is ',' or ';'.
    """

    # Check if column argument is a string datatype:
    self.__assertTypeStr(column, "column")

    # Check if mark argument is a string datatype:
    self.__assertTypeStr(mark, "mark")

    assert (column in self.__df.columns), "Error: column specified does not exist in dataFrame."

    assert (type(featureNames) == type([])), "Error: featureNames must be a list of strings."

    # Setting a udf that split the string into a list of strings.
    # This is "word, foo" ----> ["word", "foo"]
    func = udf(lambda x: x.split(mark), ArrayType(StringType()))

    self.__df = self.__df.withColumn(column, func(col(column)))
    self.undoVecAssembler(column=column, featureNames=featureNames)
    self.__addTransformation()  # checkpoint in case

    return self
def test_smvExpandStruct(self):
    schema = "id:String;a:Double;b:Double"
    df1 = self.createDF(schema, "a,1.0,10.0;a,2.0,20.0;b,3.0,30.0")
    df2 = df1.select(col("id"), struct("a", "b").alias("c"))
    res = df2.smvExpandStruct("c")
    expect = self.createDF(schema, "a,1.0,10.0;a,2.0,20.0;b,3.0,30.0")
    self.should_be_same(expect, res)
def test_vectorized_udf_null_double(self):
    data = [(3.0,), (5.0,), (-1.0,), (None,)]
    schema = StructType().add("double", DoubleType())
    df = self.spark.createDataFrame(data, schema)
    double_f = pandas_udf(lambda x: x, DoubleType())
    res = df.select(double_f(col('double')))
    self.assertEquals(df.collect(), res.collect())
def test_vectorized_udf_null_string(self):
    data = [("foo",), (None,), ("bar",), ("bar",)]
    schema = StructType().add("str", StringType())
    df = self.spark.createDataFrame(data, schema)
    str_f = pandas_udf(lambda x: x, StringType())
    res = df.select(str_f(col('str')))
    self.assertEquals(df.collect(), res.collect())
def test_vectorized_udf_null_array(self):
    data = [([1, 2],), (None,), (None,), ([3, 4],), (None,)]
    array_schema = StructType([StructField("array", ArrayType(IntegerType()))])
    df = self.spark.createDataFrame(data, schema=array_schema)
    array_f = pandas_udf(lambda x: x, ArrayType(IntegerType()))
    result = df.select(array_f(col('array')))
    self.assertEquals(df.collect(), result.collect())
def test_vectorized_udf_null_decimal(self):
    data = [(Decimal(3.0),), (Decimal(5.0),), (Decimal(-1.0),), (None,)]
    schema = StructType().add("decimal", DecimalType(38, 18))
    df = self.spark.createDataFrame(data, schema)
    decimal_f = pandas_udf(lambda x: x, DecimalType(38, 18))
    res = df.select(decimal_f(col('decimal')))
    self.assertEquals(df.collect(), res.collect())
def main(): # get dynamic frame source ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh') today = datetime.now(ho_chi_minh_timezone) today_second = long(today.strftime("%s")) print('today_id: ', today_second) #------------------------------------------------------------------------------------------------------------------# # ------------------------------------------------------------------------------------------------------------------# # diem thu tuan dyf_top_topica_question_mark_week = glueContext.create_dynamic_frame.from_catalog(database="moodle", table_name="top_topica_question_mark_week") dyf_top_topica_question_mark_week = dyf_top_topica_question_mark_week\ .select_fields(['attemptid', 'grade', 'quiz_name'])\ .rename_field('attemptid', 'attemptid_mark_week') \ .rename_field('grade', 'grade_mark_week')\ .rename_field('quiz_name', 'quiz_name_week') dyf_top_topica_question_mark_week = dyf_top_topica_question_mark_week\ .resolveChoice(specs=[('attemptid_mark_week', 'cast:long'), ('grade_mark_week', 'cast:float')]) df_top_topica_question_mark_week = dyf_top_topica_question_mark_week.toDF() df_top_topica_question_mark_week = df_top_topica_question_mark_week.dropDuplicates(['attemptid_mark_week']) if is_dev: print ('df_top_topica_question_mark_week') df_top_topica_question_mark_week.printSchema() # diem thi thang dyf_top_topica_question_marks = glueContext.create_dynamic_frame.from_catalog(database="moodle", table_name="top_topica_question_marks") dyf_top_topica_question_marks = dyf_top_topica_question_marks \ .select_fields(['attemptid', 'marks']) \ .rename_field('attemptid', 'attemptid_mark')\ .rename_field('marks', 'marks_month') dyf_top_topica_question_marks = dyf_top_topica_question_marks \ .resolveChoice(specs=[('attemptid_mark', 'cast:long')]) df_top_topica_question_marks = dyf_top_topica_question_marks.toDF() df_top_topica_question_marks = df_top_topica_question_marks.dropDuplicates(['attemptid_mark']) if is_dev: print ('df_top_topica_question_marks') df_top_topica_question_marks.printSchema() # ------------------------------------------------------------------------------------------------------------------# # dyf_student_package = glueContext.create_dynamic_frame.from_catalog(database="od_student_behavior", # table_name="student_package") # # print('dyf_student_package__0') # dyf_student_package.printSchema() # # dyf_student_package = dyf_student_package \ # .select_fields(['student_id', 'package_code', 'start_time', 'end_time']) \ # .rename_field('student_id', 'student_id_pk') # # dyf_student_package = dyf_student_package.resolveChoice( # specs=[('start_time', 'cast:long'), ('end_time', 'cast:long')]) # # df_student_package = dyf_student_package.toDF() # df_student_package = df_student_package.drop_duplicates() # ------------------------------------------------------------------------------------------------------------------# # dyf_student_package_status = glueContext.create_dynamic_frame.from_catalog(database="od_student_behavior", # table_name="student_status") # # dyf_student_package_status = dyf_student_package_status \ # .select_fields(['contact_id', 'status_code', 'start_date', 'end_date']) \ # .rename_field('contact_id', 'contact_id_ps') # # print('dyf_student_package_status::drop_duplicates') # # df_student_package_status = dyf_student_package_status.toDF() # df_student_package_status = df_student_package_status.drop_duplicates() # ------------------------------------------------------------------------------------------------------------------# dyf_result_ai = 
glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="top_result_ai" ) dyf_result_ai = dyf_result_ai.select_fields( ['id', 'answer', '.speech_result', 'right_word', 'wrong_word', 'result', 'attempt_id'])\ .rename_field('attempt_id', 'attempt_id_result_ai') dyf_result_ai = dyf_result_ai.resolveChoice(specs=[('attempt_id_result_ai', 'cast:long')]) df_result_ai = dyf_result_ai.toDF() df_result_ai = df_result_ai.drop_duplicates(['attempt_id_result_ai']) # ------------------------------------------------------------------------------------------------------------------# dyf_moodle_top_user = glueContext.create_dynamic_frame.from_catalog(database="moodle", table_name="top_user") # Chon cac truong can thiet dyf_moodle_top_user = dyf_moodle_top_user.select_fields( ['id', 'username', 'levelstudy']) df_moodle_top_user = dyf_moodle_top_user.toDF() #------------------------------------------------------------------------------------------------------------------# dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(database="tig_advisor", table_name="student_contact") dyf_student_contact = dyf_student_contact.select_fields( ['contact_id', 'student_id', 'user_name']) dyf_student_contact = Filter.apply(frame=dyf_student_contact, f=lambda x: x["contact_id"] is not None and x["contact_id"] != '' and x["student_id"] is not None and x["student_id"] != '' and x["user_name"] is not None and x["user_name"] != '') df_student_contact = dyf_student_contact.toDF() # -------------------------------------------------------------------------------------------------------------------# dyf_moodle_question_attempts = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="top_question_attempts" ) dyf_moodle_question_attempts = dyf_moodle_question_attempts.select_fields( ['id', 'rightanswer', 'responsesummary', 'timemodified', 'maxmark', 'questionusageid', 'questionid']).rename_field('id', 'question_attempt_id') dyf_moodle_question_attempts = Filter.apply(frame=dyf_moodle_question_attempts, f=lambda x: x["questionusageid"] is not None and x["questionusageid"] != '') df_moodle_question_attempts = dyf_moodle_question_attempts.toDF() df_moodle_question_attempts = df_moodle_question_attempts.dropDuplicates(['questionusageid']) # -------------------------------------------------------------------------------------------------------------------# dyf_top_quiz = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="top_quiz" ) dyf_top_quiz = dyf_top_quiz.select_fields(['name', 'id']).rename_field('id', 'quiz_id') df_top_quiz = dyf_top_quiz.toDF() # -------------------------------------------------------------------------------------------------------------------# dyf_moodle_question_steps = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="top_question_attempt_steps" ) dyf_moodle_question_steps = dyf_moodle_question_steps\ .select_fields(['id', 'state', 'questionattemptid', 'timecreated']) df_moodle_question_steps = dyf_moodle_question_steps.toDF() df_moodle_question_steps = df_moodle_question_steps.dropDuplicates(['id']) # get latest question_steps state w2 = Window.partitionBy("questionattemptid").orderBy(f.col("timecreated").desc()) df_moodle_question_steps = df_moodle_question_steps.withColumn("row", f.row_number().over(w2)) \ .where(f.col('row') <= 1) df_moodle_question_steps.cache() if is_dev: print('df_moodle_question_steps after getting latest question_steps state') df_moodle_question_steps.show(2) 
df_moodle_question_steps = df_moodle_question_steps.drop('row', 'timecreated') df_moodle_question_steps.cache() # -------------------------------------------------------------------------------------------------------------------# # dyf_mapping_grammar_lo = glueContext.create_dynamic_frame.from_catalog( # database="moodle", # table_name="mapping_grammar_lo" # ) dyf_mapping_grammar_lo = glueContext.create_dynamic_frame.from_options( connection_type="redshift", connection_options={ "url": "jdbc:redshift://datashine-dev.c4wxydftpsto.ap-southeast-1.redshift.amazonaws.com:5439/dts_odin", "user": "******", "password": "******", "dbtable": "mapping_grammar_lo", "redshiftTmpDir": "s3://dtsodin/temp/mapping_grammar_lo/v9" } ) dyf_mapping_grammar_lo = dyf_mapping_grammar_lo \ .select_fields(['question', 'lo', 'lc'])\ .rename_field('question', 'question_grammar_id') df_mapping_grammar_lo = dyf_mapping_grammar_lo.toDF() # -------------------------------------------------------------------------------------------------------------------# dyf_moodle_quiz_attempts = glueContext.create_dynamic_frame.from_catalog(database="moodle", table_name="top_quiz_attempts") # Chon cac truong can thiet if is_dev: print('dyf_moodle_quiz_attempts::original') dyf_moodle_quiz_attempts.printSchema() # try: # df_flag = spark.read.parquet("s3a://toxd-olap/transaction_log/flag/sb_native_test/sb_native_test.parquet") # read_from_index = df_flag.collect()[0]['flag'] # print('read from index: ', read_from_index) # dyf_moodle_quiz_attempts = Filter.apply(frame=dyf_moodle_quiz_attempts, # f=lambda x: x["_key"] > read_from_index) # except: # print('read flag file error ') dyf_moodle_quiz_attempts = dyf_moodle_quiz_attempts.select_fields( ['id', '_key', 'quiz', 'userid', 'sumgrades', 'uniqueid', 'timestart']) \ .rename_field('id', 'attempt_id')\ .rename_field('timestart', 'testing_time') # -------------------------------------------------------------------------------------------------------------------# if is_dev: print('df_moodle_question_attempts') df_moodle_question_attempts.printSchema() print('df_moodle_question_steps') df_moodle_question_steps.printSchema() print('df_top_quiz') df_top_quiz.printSchema() print('dyf_moodle_quiz_attempts') dyf_moodle_quiz_attempts.printSchema() print('df_moodle_top_user') df_moodle_top_user.printSchema() print('df_student_contact') df_student_contact.printSchema() # print ('df_student_package_status') # df_student_package_status.printSchema() #-------------------------------------------------------------------------------------------------------------------# dyf_moodle_quiz_attempts = Filter.apply(frame=dyf_moodle_quiz_attempts, f=lambda x: x['userid'] is not None and x['quiz'] is not None and x['testing_time'] > START_LOAD_DATE ) if is_just_monthly_exam: dyf_moodle_quiz_attempts = Filter.apply(frame=dyf_moodle_quiz_attempts, f=lambda x: x['userid'] is not None and x['quiz'] in [6L, 7L, 9L, 918L]) else: dyf_moodle_quiz_attempts = Filter.apply(frame=dyf_moodle_quiz_attempts, f=lambda x: x['userid'] is not None) df_moodle_quiz_attempts = dyf_moodle_quiz_attempts.toDF() df_moodle_quiz_attempts = df_moodle_quiz_attempts.dropDuplicates(['attempt_id']) df_moodle_quiz_attempts.cache() moodle_quiz_attempts_number = df_moodle_quiz_attempts.count() if is_dev: print ('moodle_quiz_attempts_number: ', moodle_quiz_attempts_number) if moodle_quiz_attempts_number < 1: return df_student_level = get_df_student_level(glueContext) df_student_level.cache() df_student_package = get_df_student_package(glueContext) 
df_student_package.cache() df_student_advisor = get_df_student_advisor(glueContext) df_student_advisor.cache() # Step 1: get user info, package_code, level, status df_quiz_student = df_moodle_quiz_attempts\ .join(df_moodle_top_user, df_moodle_quiz_attempts.userid == df_moodle_top_user.id)\ .join(df_student_contact, df_moodle_top_user.username == df_student_contact.user_name) package_endtime_unavailable = 99999999999L package_starttime_unavailable = 0L package_code_unavailable = 'UNAVAILABLE' student_level_code_unavailable = 'UNAVAILABLE' package_status_code_unavailable = 'UNAVAILABLE' df_quiz_student_original = df_quiz_student.select( 'id', 'uniqueid', 'quiz', 'levelstudy', 'attempt_id', df_quiz_student.testing_time.alias('student_behavior_date'), getBehaviorIdByQuiz(df_quiz_student.quiz).alias('behavior_id'), df_quiz_student.student_id.cast('long').alias('student_id'), 'contact_id', # f.lit(package_code_unavailable).alias('package_code'), # f.lit(package_endtime_unavailable).alias('package_endtime'), # f.lit(package_starttime_unavailable).alias('package_starttime'), # f.lit(student_level_code_unavailable).alias('student_level_code'), # f.lit(package_status_code_unavailable).alias('package_status_code'), f.lit(today_second).alias('transformed_at'), f.from_unixtime('testing_time', format="yyyyMM").alias('year_month_id') ) df_quiz_student_original =df_quiz_student_original\ .join(df_student_advisor, (df_quiz_student_original.contact_id == df_student_advisor.contact_id_advisor) & (df_quiz_student_original.student_behavior_date >= df_student_advisor.start_date) & (df_quiz_student_original.student_behavior_date < df_student_advisor.end_date), 'left' ) \ .join(df_student_package, (df_quiz_student_original.contact_id == df_student_package.contact_id_package) & (df_quiz_student_original.student_behavior_date >= df_student_package.package_start_time) & (df_quiz_student_original.student_behavior_date < df_student_package.package_end_time), 'left' ) \ .join(df_student_level, (df_quiz_student_original.contact_id == df_student_level.contact_id_level) & (df_quiz_student_original.student_behavior_date >= df_student_level.start_date) & (df_quiz_student_original.student_behavior_date < df_student_level.end_date), 'left' ) df_quiz_student_original = df_quiz_student_original \ .withColumn('student_behavior_id', f.md5(concaText( df_quiz_student_original.student_behavior_date, df_quiz_student_original.behavior_id, df_quiz_student_original.student_id, df_quiz_student_original.contact_id, df_quiz_student_original.package_code, df_quiz_student_original.package_status_code, df_quiz_student_original.student_level_code, df_quiz_student_original.transformed_at))) df_quiz_student_original.persist(StorageLevel.DISK_ONLY_2) # | -- id: long(nullable=true) # | -- uniqueid: long(nullable=true) # | -- quiz: long(nullable=true) # | -- levelstudy: string(nullable=true) # | -- attempt_id: long(nullable=true) # | -- student_behavior_date: long(nullable=true) # | -- behavior_id: long(nullable=true) # | -- student_id: long(nullable=true) # | -- contact_id: string(nullable=true) # | -- package_code: string(nullable=false) # | -- package_endtime: long(nullable=false) # | -- package_starttime: long(nullable=false) # | -- student_level_code: string(nullable=false) # | -- package_status_code: string(nullable=false) # | -- transformed_at: long(nullable=false) # | -- student_behavior_id: string(nullable=true) if is_dev: print('df_quiz_student_original') df_quiz_student_original.printSchema() df_quiz_student_original.show(1) # get data 
for getting testing detail df_quiz_student = df_quiz_student_original #1. save weekly native test for AI (speeking) # Step 2: Seperate result AI(Speaking) and question attempt # Step 2.1 Get result AI df_quiz_student_ai = df_quiz_student\ .join(df_result_ai, df_quiz_student.attempt_id == df_result_ai.attempt_id_result_ai, 'inner')\ .join(df_top_quiz, df_quiz_student.quiz == df_top_quiz.quiz_id, 'left') if is_limit_test: df_quiz_student_ai = df_quiz_student_ai.limit(100) if is_dev: print('df_quiz_student_ai') df_quiz_student_ai.printSchema() print('df_quiz_student_ai::after:separate:: ', df_quiz_student_ai.count()) source_system_native_test_ai = 'NATIVE_TEST_AI' source_system_native_test_simple = 'NATIVE_TEST_SIMPLE' source_system_native_test_grammar = 'NATIVE_TEST_GRAMMAR' current_step_unavailable = -1L total_step_unavailable = -1L learning_category_id_unavailable = -1L learning_unit_code_unavailable = 'UNAVAILABLE' learning_object_type_code_unavailable = 'UNAVAILABLE' learning_object_id_unavailable = -1L learning_object_unavailable = 'UNAVAILABLE' learning_category_code_unavailable = 'UNAVAILABLE' student_answer_detail_unavailable = 'UNAVAILABLE' duration_unavailable = -1L max_point_unavailable = -1L received_point_unavailable = -2L test_type_unavailable = 'UNAVAILABLE' right_answer_unavailable = 'UNAVAILABLE' wrong_answer_unavailable = 'UNAVAILABLE' # # # # #------------------------------------------------------------------------------------------------------------------# # # # # #------------------------------------------------------------------------------------------------------------------# # if is_dev: print('df_quiz_student_ai') df_quiz_student_ai.printSchema() df_quiz_student_ai.show(1) # Step 2.2 Get data for result AI df_quiz_student_ai_full = df_quiz_student_ai.select( 'student_behavior_id', 'student_behavior_date', 'behavior_id', 'student_id', 'contact_id', # 'package_code', # df_quiz_student_ai.end_time.cast('long').alias('package_endtime'), # df_quiz_student_ai.start_time.cast('long').alias('package_starttime'), # # df_quiz_student_ai.levelstudy.alias('student_level_code'), # df_quiz_student_ai.status_code.alias('package_status_code'), 'package_code', # 'package_endtime', # 'package_starttime', 'student_level_code', 'package_status_code', 'transformed_at', 'attempt_id', #for student_test_detail f.lit(source_system_native_test_ai).alias('source_system'), df_quiz_student_ai.name.alias('test_type'), df_quiz_student_ai.attempt_id_result_ai.cast('long').alias('attempt_step_id'), f.lit(current_step_unavailable).cast('long').alias('current_step'), f.lit(total_step_unavailable).cast('long').alias('total_step'), f.lit(learning_category_id_unavailable).cast('long').alias('learning_category_id'), f.lit(learning_category_code_unavailable).alias('learning_category_code'), f.lit(learning_unit_code_unavailable).cast('string').alias('learning_unit_code'), f.lit(learning_object_type_code_unavailable).cast('string').alias('learning_object_type_code'), f.lit(learning_object_id_unavailable).cast('long').alias('learning_object_id'), f.lit(learning_object_unavailable).cast('string').alias('learning_object'), df_quiz_student_ai.answer.cast('string').alias('correct_answer'), df_quiz_student_ai.speech_result.cast('string').alias('student_answer'), f.lit(student_answer_detail_unavailable).cast('string').alias('student_answer_detail'), 'result', df_quiz_student_ai.right_word.cast('string').alias('right_answer'), df_quiz_student_ai.wrong_word.cast('string').alias('wrong_answer'), 
f.lit(duration_unavailable).cast('long').alias('duration'), f.lit(max_point_unavailable).cast('long').alias('max_point'), f.lit(received_point_unavailable).cast('long').alias('received_point'), 'year_month_id' ) if is_dev: print('df_quiz_student_ai_full') df_quiz_student_ai_full.printSchema() # # Step 3.1 Get data for question_attempts df_quiz_student_question = df_quiz_student\ .join(df_moodle_question_attempts, df_quiz_student.uniqueid == df_moodle_question_attempts.questionusageid, 'inner')\ .join(df_moodle_question_steps, df_moodle_question_attempts.question_attempt_id == df_moodle_question_steps.questionattemptid, 'left')\ .join(df_mapping_grammar_lo, df_moodle_question_attempts.question_attempt_id == df_mapping_grammar_lo.question_grammar_id, 'left') if is_limit_test: df_quiz_student_question = df_quiz_student_question.limit(100) if is_dev: print('df_quiz_student_question') df_quiz_student_question.printSchema() print('df_quiz_student_question: ', df_quiz_student_question.count()) def getSourceSystemByLC(lc): if lc in ['G01', 'G02', 'G03', 'G04', 'G05']: return source_system_native_test_grammar return source_system_native_test_simple getSourceSystemByLC = f.udf(getSourceSystemByLC, StringType()) if is_dev: print('df_quiz_student_question') df_quiz_student_question.printSchema() df_quiz_student_question.show(1) #Step 3.2 Get data for question_attempts df_quiz_student_question_full = df_quiz_student_question.select( 'student_behavior_id', 'student_behavior_date', 'behavior_id', 'student_id', 'contact_id', # 'package_code', # df_quiz_student_question.end_time.cast('long').alias('package_endtime'), # df_quiz_student_question.start_time.cast('long').alias('package_starttime'), # # df_quiz_student_question.levelstudy.alias('student_level_code'), # df_quiz_student_question.status_code.alias('package_status_code'), 'package_code', # 'package_endtime', # 'package_starttime', 'student_level_code', 'package_status_code', 'transformed_at', 'attempt_id', # for student_test_detail getSourceSystemByLC(df_quiz_student_question.lc).alias('source_system'), f.lit(test_type_unavailable).alias('test_type'), df_quiz_student_question.question_attempt_id.cast('long').alias('attempt_step_id'), f.lit(current_step_unavailable).cast('long').alias('current_step'), f.lit(total_step_unavailable).cast('long').alias('total_step'), f.lit(learning_category_id_unavailable).cast('long').alias('learning_category_id'), df_quiz_student_question.lc.cast('string').alias('learning_category_code'), f.lit(learning_unit_code_unavailable).cast('string').alias('learning_unit_code'), f.lit(learning_object_type_code_unavailable).cast('string').alias('learning_object_type_code'), f.lit(learning_object_id_unavailable).cast('long').alias('learning_object_id'), df_quiz_student_question.lo.cast('string').alias('learning_object'), df_quiz_student_question.rightanswer.cast('string').alias('correct_answer'), df_quiz_student_question.responsesummary.cast('string').alias('student_answer'), f.lit(student_answer_detail_unavailable).cast('string').alias('student_answer_detail'), df_quiz_student_question.state.alias('result'), f.lit(right_answer_unavailable).alias('right_answer'), f.lit(wrong_answer_unavailable).alias('wrong_answer'), f.lit(duration_unavailable).cast('long').alias('duration'), f.lit(max_point_unavailable).cast('long').alias('max_point'), df_quiz_student_question.maxmark.cast('long').alias('received_point'), 'year_month_id' ) if is_dev: print ('df_quiz_student_question_full') df_quiz_student_question_full.printSchema() 
print('df_quiz_student_ai_full::before::union') print('df_quiz_student_ai_full::number: ', df_quiz_student_ai_full.count()) print('df_quiz_student_question_full::number: ', df_quiz_student_question_full.count()) # df_quiz_full = df_quiz_student_ai_full.union(df_quiz_student_question_full) if is_dev: print('df_quiz_full') df_quiz_full.printSchema() df_quiz_full.show(3) # # # # # #save to student behavior # dyf_quiz_full = DynamicFrame.fromDF(df_quiz_full, glueContext, 'dyf_quiz_full') # # #Save to apply_ouput_test_detail = ApplyMapping.apply(frame=dyf_quiz_full, mappings=[("student_behavior_id", "string", "student_behavior_id", "string"), ("attempt_id", "long", "attempt_id", "long"), ("source_system", "string", "source_system", "string"), ("test_type", "string", "test_type", "string"), ("attempt_step_id", "long", "attempt_step_id", "long"), ("current_step", "long", "current_step", "long"), ("total_step", "long", "total_step", "long"), ("learning_category_id", "long", "learning_category_id", "long"), ("learning_category_code", "string", "learning_category_code", "string"), ("learning_unit_code", "string", "learning_unit_code", "string"), ("learning_object_type_code", "string", "learning_object_type_code", "string"), ("learning_object_id", "long", "learning_object_id", "long"), ("learning_object", "string", "learning_object", "string"), ("correct_answer", "string", "correct_answer", "string"), ("student_answer", "string", "student_answer", "string"), ("student_answer_detail", "string", "student_answer_detail", "string"), ("result", "string", "result", "string"), ("right_answer", "string", "right_answer", "string"), ("wrong_answer", "string", "wrong_answer", "string"), ("duration", "long", "duration", "long"), ("max_point", "long", "max_point", "long"), ("received_point", "long", "received_point", "long"), ("student_behavior_date", "long", "created_at", "long"), ("behavior_id", "long", "behavior_id", "long"), ("year_month_id", "string", "year_month_id", "long") ]) dfy_output_test = ResolveChoice.apply(frame=apply_ouput_test_detail, choice="make_cols", transformation_ctx="resolvechoice2") # # save to s3 glueContext.write_dynamic_frame.from_options( frame=dfy_output_test, connection_type="s3", connection_options={"path": "s3://toxd-olap/transaction_log/student_behavior/sb_student_test_detail", "partitionKeys": ["behavior_id", "year_month_id"]}, format="parquet") # save to redshift glueContext.write_dynamic_frame.from_jdbc_conf(frame=dfy_output_test, catalog_connection="glue_redshift", connection_options={ "dbtable": "sb_student_test_detail", "database": "transaction_log" }, redshift_tmp_dir="s3n://datashine-dev-redshift-backup/translation_log/student_behavior/sb_student_test_detail", transformation_ctx="datasink4") # #-------------------------------------------------------------------------------------------------------------------# # # for save behavior df_quiz_full = df_quiz_full.dropDuplicates(['student_behavior_id']) dyf_quiz_student_behavior = DynamicFrame.fromDF(df_quiz_student_original, glueContext, 'dyf_quiz_student_behavior') apply_ouput_hehavior = ApplyMapping.apply(frame=dyf_quiz_student_behavior, mappings=[ ("student_behavior_id", "string", "student_behavior_id", "string"), ("student_behavior_date", "long", "student_behavior_date", "long"), ("behavior_id", "long", "behavior_id", "int"), ("student_id", "long", "student_id", "long"), ("contact_id", "string", "contact_id", "string"), ("package_code", "string", "package_code", "string"), ("student_level_code", "string", 
"student_level_code", "string"), ("package_status_code", "string", "package_status_code", "string"), ("advisor_id", "long", "advisor_id", "long"), ("transformed_at", "long", "transformed_at", "long"), ("year_month_id", "string", "year_month_id", "long") ]) dfy_output = ResolveChoice.apply(frame=apply_ouput_hehavior, choice="make_cols", transformation_ctx="resolvechoice2") # save to s3 glueContext.write_dynamic_frame.from_options( frame=dfy_output, connection_type="s3", connection_options={"path": "s3://toxd-olap/transaction_log/student_behavior/sb_student_behavior", "partitionKeys": ["behavior_id", "year_month_id"]}, format="parquet") # save to redshift glueContext.write_dynamic_frame.from_jdbc_conf(frame=dfy_output, catalog_connection="glue_redshift", connection_options={ "dbtable": "sb_student_behavior", "database": "transaction_log" }, redshift_tmp_dir="s3n://datashine-dev-redshift-backup/translation_log/student_behavior/sb_student_behavior", transformation_ctx="datasink4") # # -------------------------------------------------------------------------------------------------------------------# # # # Step 5 - get marks if is_dev: print('Check before get marks') print ('df_top_topica_question_marks') df_top_topica_question_marks.printSchema() print ('df_top_topica_question_marks') df_top_topica_question_mark_week.printSchema() df_quiz_mark = df_quiz_student_original.select('student_behavior_id', 'behavior_id', 'attempt_id', 'year_month_id') dyf_quiz_mark = DynamicFrame.fromDF(df_quiz_mark, glueContext, 'dyf_quiz_mark') #check du lieu df_quiz_mark datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dyf_quiz_mark, catalog_connection="glue_redshift", connection_options={ "dbtable": "dyf_quiz_mark", "database": "dts_odin_checking" }, redshift_tmp_dir="s3://dts-odin/dts_odin_checking/temp/dyf_quiz_mark", transformation_ctx="datasink4") df_quiz_week = df_quiz_mark.where(df_quiz_mark.behavior_id == BEHAVIOR_ID_TEST_TUAN) df_quiz_month = df_quiz_mark.where(df_quiz_mark.behavior_id == BEHAVIOR_ID_TEST_THANG) # # # ------------------------------------------------------------------------------------------------------------------# df_quiz_week_marks = df_quiz_week.join(df_top_topica_question_mark_week, df_quiz_week.attempt_id == df_top_topica_question_mark_week.attemptid_mark_week, 'left' ) # if is_dev: print('df_quiz_week_marks::after_join_df_top_topica_question_mark_week') df_quiz_week_marks.printSchema() df_quiz_week_marks.show(10) # df_quiz_week_marks = df_quiz_week_marks.na.fill({'grade_mark_week': 0}) # def convertIntergerToFloat(grade_mark_week): if grade_mark_week is None: return float(0.0) return float(grade_mark_week) convertIntergerToFloat = f.udf(convertIntergerToFloat, FloatType()) # # df_quiz_week_marks = df_quiz_week_marks.select( 'behavior_id', 'attempt_id', 'student_behavior_id', df_quiz_week_marks.quiz_name_week.alias('question_category'), df_quiz_week_marks.grade_mark_week.alias('grade_t'), 'year_month_id' ) # if is_dev: print('df_quiz_week_marks') df_quiz_week_marks.printSchema() df_quiz_week_marks.show(2) # dyf_quiz_week_marks = DynamicFrame.fromDF(df_quiz_week_marks, glueContext, 'dyf_quiz_week_marks') datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dyf_quiz_week_marks, catalog_connection="glue_redshift", connection_options={ "dbtable": "dyf_quiz_week_marks", "database": "dts_odin_checking" }, redshift_tmp_dir="s3://dts-odin/dts_odin_checking/temp/dyf_quiz_week_marks", transformation_ctx="datasink4") # # 
#------------------------------------------------------------------------------------------------------------------# MARK_UNAVAILABLE = '-1' def getMapFromStringJson(str_value): if str_value is None: return { {"VOCABULARY": MARK_UNAVAILABLE, "CONVERSATIONAL_EXPRESSION": MARK_UNAVAILABLE, "LISTENING": MARK_UNAVAILABLE, "DICTATION": MARK_UNAVAILABLE, "GRAMMAR": MARK_UNAVAILABLE, "READING": MARK_UNAVAILABLE } } str_value = str(str_value) json_value = json.loads(str_value) return json_value # getMapFromStringJson = f.udf(getMapFromStringJson, MapType(StringType(), StringType())) # df_quiz_month_marks = df_quiz_month.join(df_top_topica_question_marks, df_quiz_month.attempt_id == df_top_topica_question_marks.attemptid_mark, 'inner') if is_dev: print('df_quiz_month_marks after join question marks') df_quiz_month_marks.printSchema() df_quiz_month_marks.show(5) df_quiz_month_marks = df_quiz_month_marks.select( 'behavior_id', 'attempt_id', 'student_behavior_id', getMapFromStringJson(df_quiz_month_marks.marks_month).alias('marks_month_dict'), 'year_month_id' ) # if is_dev: print('df_quiz_month_marks after join question marks::after convert marks_month_dict') df_quiz_month_marks.printSchema() df_quiz_month_marks.show(5) df_quiz_month_marks = df_quiz_month_marks.select( 'behavior_id', 'attempt_id', 'student_behavior_id', f.explode(df_quiz_month_marks.marks_month_dict), 'year_month_id' ) # if is_dev: print('df_quiz_month_marks after join question marks::after explode') df_quiz_month_marks.printSchema() df_quiz_month_marks.show(5) # df_quiz_month_marks = df_quiz_month_marks.select( 'behavior_id', 'attempt_id', 'student_behavior_id', df_quiz_month_marks.key.alias('question_category'), df_quiz_month_marks.value.cast('float').alias('grade_t'), 'year_month_id' ) # if is_dev: print('df_quiz_month_marks::complete') df_quiz_month_marks.printSchema() df_quiz_month_marks.show(3) dyf_quiz_month_marks = DynamicFrame.fromDF(df_quiz_month_marks, glueContext, 'dyf_quiz_month_marks') datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dyf_quiz_month_marks, catalog_connection="glue_redshift", connection_options={ "dbtable": "dyf_quiz_month_marks", "database": "dts_odin_checking" }, redshift_tmp_dir="s3://dts-odin/dts_odin_checking/temp/dyf_quiz_month_marks", transformation_ctx="datasink4") # # ------------------------------------------------------------------------------------------------------------------# # df_quiz_month_marks_full = df_quiz_week_marks.union(df_quiz_month_marks) df_quiz_month_marks_full = df_quiz_month_marks_full.dropDuplicates(['attempt_id', 'question_category']) dyf_quiz_month_marks_full = DynamicFrame.fromDF(df_quiz_month_marks_full, glueContext, 'dyf_quiz_month_marks_full') datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dyf_quiz_month_marks_full, catalog_connection="glue_redshift", connection_options={ "dbtable": "dyf_quiz_month_marks_full", "database": "dts_odin_checking" }, redshift_tmp_dir="s3://dts-odin/dts_odin_checking/temp/dyf_quiz_month_marks_full", transformation_ctx="datasink4") # QUESTION_CATEGORY__UNAVAILABLE = 'UNAVAILABLE' df_quiz_month_marks_full = df_quiz_month_marks_full.na.fill( {'question_category': QUESTION_CATEGORY__UNAVAILABLE, 'grade_t': MARK_UNAVAILABLE} ) if is_dev: print ('df_quiz_month_marks_full') df_quiz_month_marks_full.printSchema() df_quiz_month_marks_full.show(3) # # dyf_quiz_month_marks_full = DynamicFrame.fromDF(df_quiz_month_marks_full, glueContext, 'dyf_quiz_month_marks_full') # # apply_dyf_quiz_month_marks_full = 
ApplyMapping.apply(frame=dyf_quiz_month_marks_full, mappings=[("behavior_id", "long", "behavior_id", "long"), ("attempt_id", "long", "attempt_id", "long"), ("student_behavior_id", "string", "student_behavior_id", "string"), ("question_category", "string", "question_category", "string"), ("grade_t", "float", "grade", "float"), ("year_month_id", "string", "year_month_id", "long") ]) # dyf_quiz_month_marks_full_output = ResolveChoice.apply(frame=apply_dyf_quiz_month_marks_full, choice="make_cols", transformation_ctx="resolvechoice2") # # save to s3 glueContext.write_dynamic_frame.from_options( frame=dyf_quiz_month_marks_full_output, connection_type="s3", connection_options={"path": "s3://toxd-olap/transaction_log/student_behavior/sb_student_test_mark", "partitionKeys": ["behavior_id", "year_month_id"]}, format="parquet") #save to redshift glueContext.write_dynamic_frame.from_jdbc_conf(frame=dyf_quiz_month_marks_full_output, catalog_connection="glue_redshift", connection_options={ "dbtable": "sb_student_test_mark", "database": "transaction_log" }, redshift_tmp_dir="s3n://datashine-dev-redshift-backup/translation_log/student_behavior/sb_student_test_mark", transformation_ctx="datasink4") # # df_quiz_full.unpersist() # df_moodle_quiz_attempts.unpersist() # df_moodle_question_steps.unpersist() df_mdl_logsservice_in_out = dyf_moodle_quiz_attempts.toDF() flag = df_mdl_logsservice_in_out.agg({"_key": "max"}).collect()[0][0] flag_data = [flag] df = spark.createDataFrame(flag_data, "long").toDF('flag') df.write.parquet("s3a://toxd-olap/transaction_log/flag/sb_native_test/sb_native_test.parquet", mode="overwrite") df_quiz_student_original.unpersist()
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
import os

os.environ["SPARK_HOME"] = "C:/spark-2.4.5-bin-hadoop2.7"
os.environ["HADOOP_HOME"] = "C:/winutils"

# Create spark session
spark = SparkSession.builder.appName("ICP 14").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

# Load data and select feature and label columns
data = spark.read.format("csv").option("header", True).option(
    "inferSchema", True).option("delimiter", ",").load("C:/Users/Lalith Chandra A/Downloads/car.csv")
data = data.withColumn("label", when(col("engine-location") == "front", 1).otherwise(0)).select("label", "length")

# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features")
data = assembler.transform(data)

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
model = lr.fit(data)

# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(model.coefficients))
print("Intercept: " + str(model.intercept))

# We can also use the multinomial family for binary classification
mlr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, family="multinomial")
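# Hedged follow-up sketch: applying the fitted model from the snippet above and checking it
# with a BinaryClassificationEvaluator. It reuses `model` and `data` as defined above; the
# evaluator settings are illustrative and not part of the original script.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

predictions = model.transform(data)
predictions.select("label", "length", "probability", "prediction").show(5)

evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction")
print("Area under ROC: " + str(evaluator.evaluate(predictions)))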
from pyspark.sql.functions import col, to_date
from pyspark.sql.types import IntegerType

# creating a dataframe by slicing the fixed-width NOAA record stored in 'value'
df3 = df2.withColumn('Weather_Station', df2['value'].substr(5, 6))\
    .withColumn('WBAN', df2['value'].substr(11, 5))\
    .withColumn('Observation_Date', to_date(df2['value'].substr(16, 8), "yyyyMMdd"))\
    .withColumn('Observation_Hour', df2['value'].substr(24, 4).cast(IntegerType()))\
    .withColumn('Latitude', df2['value'].substr(29, 6).cast('float') / 1000)\
    .withColumn('Longitude', df2['value'].substr(35, 7).cast('float') / 1000)\
    .withColumn('Elevation', df2['value'].substr(47, 5).cast(IntegerType()))\
    .withColumn('Wind_Direction', df2['value'].substr(61, 3).cast(IntegerType()))\
    .withColumn('WD_Quality_Code', df2['value'].substr(64, 1).cast(IntegerType()))\
    .withColumn('Sky_Ceiling_Height', df2['value'].substr(71, 5).cast(IntegerType()))\
    .withColumn('SC_Quality_Code', df2['value'].substr(76, 1).cast(IntegerType()))\
    .withColumn('Visibility_Distance', df2['value'].substr(79, 6).cast(IntegerType()))\
    .withColumn('VD_Quality_Code', df2['value'].substr(86, 1).cast(IntegerType()))\
    .withColumn('Air_Temperature', df2['value'].substr(88, 5).cast('float') / 10)\
    .withColumn('AT_Quality_Code', df2['value'].substr(93, 1).cast(IntegerType()))\
    .withColumn('Dew_Point', df2['value'].substr(94, 5).cast('float'))\
    .withColumn('DP_Quality_Code', df2['value'].substr(99, 1).cast(IntegerType()))\
    .withColumn('Atmospheric_Pressure', df2['value'].substr(100, 5).cast('float') / 10)\
    .withColumn('AP_Quality_Code', df2['value'].substr(105, 1).cast(IntegerType()))

df3.show(10)

# filtering out air pressure records carrying the 9999.9 missing-data sentinel
df_AP_NoBadRecords = df3.filter(col("Atmospheric_Pressure") != 9999.9)

# writing the file
df_AP_NoBadRecords.write.format("csv").mode("overwrite").option(
    "header", "true").save(
        "hdfs://namenode/output/itmd-521/tdp/2001/valid-atmospheric-pressure")
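# Small illustrative sketch, separate from the NOAA parsing above: Column.substr(startPos, length)
# is 1-based, which is why the field offsets above start at 5, 11, 16, and so on.
# The tiny DataFrame below exists only to demonstrate the call.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.master("local[1]").appName("substr_sketch").getOrCreate()
demo = spark.createDataFrame([("ABCDEFGHIJ",)], ["value"])
# substr(2, 3) keeps characters 2 through 4, i.e. "BCD"
demo.select(col("value").substr(2, 3).alias("slice")).show()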
df_port_counts = df.filter(col('possible_HP') == True).select("DstAddr", "Dport").distinct().groupBy("Dport").count()
df = df.join(df_port_counts, ["Dport"], how="left")

total_count = df.select('DstAddr').distinct().count()
df = df.withColumn("total_count", lit(total_count))

# place Honeypots
df = df.withColumn("chosenToBeHP1", place_honeypots_udf_mix(df["Dport"], df["count"], df["total_count"]))
df = df.withColumn("isHP1", col('chosenToBeHP1') & col('possible_HP'))
df = df.withColumn("chosenToBeHP2", place_honeypots_udf_cznic(df["Dport"], df["count"], df["total_count"]))
df = df.withColumn("isHP2", col('chosenToBeHP2') & col('possible_HP'))

# filter out flows without honeypots
df_att_det_mix = df.filter(df.isHP1).groupBy('SrcAddr').agg(F.min(F.col('timestamp')).alias("detectionTime_mix"))
df_att_det_cznic = df.filter(df.isHP2).groupBy('SrcAddr').agg(F.min(F.col('timestamp')).alias("detectionTime_cznic"))

df = df.join(df_att_det_mix, ['SrcAddr'], how="left")
df = df.join(df_att_det_cznic, ['SrcAddr'], how="left")

# count saved
try:
    saved_mix = df.filter(col('detectionTime_mix').isNotNull()).filter(~df["possible_HP"]).filter(col('timestamp') > col('detectionTime_mix')).count()
    saved_cznic = df.filter(col('detectionTime_cznic').isNotNull()).filter(~df["possible_HP"]).filter(col('timestamp') > col('detectionTime_cznic')).count()
    print('In ', input_file, ' saved_mix ', str(saved_mix), ' saved_cznic ', str(saved_cznic))
except py4j.protocol.Py4JJavaError:
    print('oops! Something went wrong.')

sqlContext.clearCache()
print("DONE")
schema = StructType([ StructField("@metadata", StringType()), StructField("@timestamp", StringType()), StructField("name", StringType()), StructField("payload", StringType()), StructField("well_id", StringType()) ]) print("Creating static df") static_spark_reader = spark.read.format("kafka").option( "kafka.bootstrap.servers", "kafka-cluster-kafka-bootstrap.ddt-persistence.svc.cluster.local:9092" ).option("subscribe", "ddt").option("startingOffsets", "earliest").load() #static_spark_reader.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)").write.format("parquet").mode("Overwrite").save(output_path) static_spark_reader.selectExpr("CAST(key AS STRING) as key", "CAST(value AS STRING) as value")\ .select(from_json(col("value").cast("string"), schema).alias("value"))\ .write.format("parquet")\ .mode("append")\ .option("checkpointLocation", checkpoint_path)\ .option("path", output_path)\ .save() """ spark.readStream.format("kafka")\ .option("kafka.bootstrap.servers", "kafka-cluster-kafka-bootstrap.ddt-persistence.svc.cluster.local:9092")\ .option("subscribe", "ddt").option("startingOffsets", "latest")\ .load()\ .selectExpr("CAST(key AS STRING) as key", "CAST(value AS STRING) as value")\ .select(from_json(col("value").cast("string"), schema).alias("value"))\ .writeStream.format("parquet")\ .outputMode("append")\ .option("path", output_path)\
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

spark = SparkSession.builder.appName("MostPopularSuperhero").getOrCreate()

schema = StructType([ \
    StructField("id", IntegerType(), True), \
    StructField("name", StringType(), True)])

names = spark.read.schema(schema).option(
    "sep", " ").csv("file:///SparkCourse/Marvel+Names.txt")

lines = spark.read.text("file:///SparkCourse/Marvel+Graph.txt")

# Small tweak vs. what's shown in the video: we trim each line of whitespace as that could
# throw off the counts.
connections = lines.withColumn("id", func.split(func.trim(func.col("value")), " ")[0]) \
    .withColumn("connections", func.size(func.split(func.trim(func.col("value")), " ")) - 1) \
    .groupBy("id").agg(func.sum("connections").alias("connections"))

minConnections = connections.agg(func.min("connections")).first()[0]

minConnectionHeroes = connections.filter(
    func.col("connections") == minConnections)

minConnectionHeroesNames = minConnectionHeroes.join(names, "id")

print("The following characters have only " + str(minConnections) + " co-appearances.")

minConnectionHeroesNames.select("name").show()
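# Hedged companion sketch: the snippet above actually finds the *least* connected heroes even
# though the app is named MostPopularSuperhero. The most-popular variant would look roughly
# like this, reusing `connections` and `names` from above.
mostPopular = connections.sort(func.col("connections").desc()).first()
mostPopularName = names.filter(func.col("id") == mostPopular[0]).select("name").first()
print(mostPopularName[0] + " is the most popular superhero with " +
      str(mostPopular[1]) + " co-appearances.")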
    'Eritrea', 'Ethiopia', 'Gabon', 'Gambia', 'Ghana', 'Guinea', 'Guinea-Bissau',
    'Iran', 'Iraq', 'Israel', 'Ivory Coast', 'Jordan', 'Kenya', 'Kuwait',
    'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Madagascar', 'Malawi', 'Mali',
    'Mauritania', 'Morocco', 'Mozambique', 'Namibia', 'Niger', 'Nigeria', 'Oman',
    'Palestine', 'Republic of Congo', 'Rwanda', 'Saudi Arabia', 'Senegal',
    'Sierra Leone', 'Somalia', 'South Africa', 'South Sudan', 'Sudan', 'Syria',
    'Tanzania', 'Togo', 'Tunisia', 'Turkey', 'Uganda', 'United Arab Emirates',
    'Yemen', 'Zambia', 'Zimbabwe'
]

conflict_quads = ['Verbal Conflict', 'Material Conflict']

# COMMAND ----------

# DBTITLE 1,Select specified preprocessed data
gdelt2021 = preprocessedGDELT.filter(F.col('ActionGeo_FullName').isin(countries)) \
    .filter(F.col('EventTimeDate') >= F.lit('2021-03-01'))

# add a binary conflict / no-conflict column
gdelt2021 = gdelt2021.withColumn(
    'Conflict',
    F.when(F.col('QuadClassString').isin(conflict_quads), True).otherwise(False))

gdelt2021.limit(2).toPandas()

# COMMAND ----------

# DBTITLE 1,Create Initial Report Variables
# create function to calculate median
median_udf = F.udf(lambda x: float(np.quantile(x, 0.5)), FloatType())
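# Hedged usage sketch: median_udf above takes an array of values, so it would typically be
# applied to the output of F.collect_list. 'GoldsteinScale' is used here only as an example
# column name and is not taken from the original notebook.
median_by_country = gdelt2021.groupBy('ActionGeo_FullName') \
    .agg(median_udf(F.collect_list('GoldsteinScale')).alias('median_goldstein'))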
def compute_hist(psdf, bins): # 'data' is a Spark DataFrame that selects one column. assert isinstance(bins, (np.ndarray, np.generic)) sdf = psdf._internal.spark_frame scols = [] input_column_names = [] for label in psdf._internal.column_labels: input_column_name = name_like_string(label) input_column_names.append(input_column_name) scols.append( psdf._internal.spark_column_for(label).alias( input_column_name)) sdf = sdf.select(*scols) # 1. Make the bucket output flat to: # +----------+-------+ # |__group_id|buckets| # +----------+-------+ # |0 |0.0 | # |0 |0.0 | # |0 |1.0 | # |0 |2.0 | # |0 |3.0 | # |0 |3.0 | # |1 |0.0 | # |1 |1.0 | # |1 |1.0 | # |1 |2.0 | # |1 |1.0 | # |1 |0.0 | # +----------+-------+ colnames = sdf.columns bucket_names = ["__{}_bucket".format(colname) for colname in colnames] output_df = None for group_id, (colname, bucket_name) in enumerate(zip(colnames, bucket_names)): # creates a Bucketizer to get corresponding bin of each value bucketizer = Bucketizer(splits=bins, inputCol=colname, outputCol=bucket_name, handleInvalid="skip") bucket_df = bucketizer.transform(sdf) if output_df is None: output_df = bucket_df.select( SF.lit(group_id).alias("__group_id"), F.col(bucket_name).alias("__bucket")) else: output_df = output_df.union( bucket_df.select( SF.lit(group_id).alias("__group_id"), F.col(bucket_name).alias("__bucket"))) # 2. Calculate the count based on each group and bucket. # +----------+-------+------+ # |__group_id|buckets| count| # +----------+-------+------+ # |0 |0.0 |2 | # |0 |1.0 |1 | # |0 |2.0 |1 | # |0 |3.0 |2 | # |1 |0.0 |2 | # |1 |1.0 |3 | # |1 |2.0 |1 | # +----------+-------+------+ result = (output_df.groupby("__group_id", "__bucket").agg( F.count("*").alias("count")).toPandas().sort_values( by=["__group_id", "__bucket"])) # 3. Fill empty bins and calculate based on each group id. From: # +----------+--------+------+ # |__group_id|__bucket| count| # +----------+--------+------+ # |0 |0.0 |2 | # |0 |1.0 |1 | # |0 |2.0 |1 | # |0 |3.0 |2 | # +----------+--------+------+ # +----------+--------+------+ # |__group_id|__bucket| count| # +----------+--------+------+ # |1 |0.0 |2 | # |1 |1.0 |3 | # |1 |2.0 |1 | # +----------+--------+------+ # # to: # +-----------------+ # |__values1__bucket| # +-----------------+ # |2 | # |1 | # |1 | # |2 | # |0 | # +-----------------+ # +-----------------+ # |__values2__bucket| # +-----------------+ # |2 | # |3 | # |1 | # |0 | # |0 | # +-----------------+ output_series = [] for i, (input_column_name, bucket_name) in enumerate(zip(input_column_names, bucket_names)): current_bucket_result = result[result["__group_id"] == i] # generates a pandas DF with one row for each bin # we need this as some of the bins may be empty indexes = pd.DataFrame({"__bucket": np.arange(0, len(bins) - 1)}) # merges the bins with counts on it and fills remaining ones with zeros pdf = indexes.merge(current_bucket_result, how="left", on=["__bucket"]).fillna(0)[["count"]] pdf.columns = [input_column_name] output_series.append(pdf[input_column_name]) return output_series
def outliers(data, colname, lfence, ufence):
    # Builds expression to identify outliers
    expression = F.col("`%s`" % colname).between(lfence, ufence)
    # Creates a column to flag rows as outliers or not
    return data._psdf._internal.resolved_copy.spark_frame.withColumn(
        "__{}_outlier".format(colname), ~expression)
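# Hedged sketch of where lfence and ufence might come from: the usual 1.5 * IQR rule computed
# with DataFrame.approxQuantile on a plain Spark DataFrame. `sdf` and the column name "v" are
# illustrative and not part of the function above.
q1, q3 = sdf.approxQuantile("v", [0.25, 0.75], 0.001)
iqr = q3 - q1
lfence, ufence = q1 - 1.5 * iqr, q3 + 1.5 * iqr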
from pyspark.sql.column import Column
from pyspark.sql.column import _to_java_column
from pyspark.sql.column import _to_seq
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType

if __name__ == "__main__":
    spark = SparkSession.builder.appName("SimpleApp").getOrCreate()
    sc = spark.sparkContext

    def udfIpToIntScalaWrapper(ipString):
        _ipToIntUDF = sc._jvm.CustomUDFs.ipToIntUDF()
        return Column(
            _ipToIntUDF.apply(_to_seq(sc, [ipString], _to_java_column)))

    df = spark.createDataFrame(["192.168.0.1"], "string").toDF("ip")

    df\
        .withColumn("ip_int_scala", udfIpToIntScalaWrapper(col("ip")))\
        .show()
import sys

sys.path.append('/home/kanak/spark-2.4.7-bin-hadoop2.7/python')
sys.path.append(
    '/home/kanak/spark-2.4.7-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip')

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("Rate Streaming")\
        .getOrCreate()

    spark.sparkContext.setLogLevel("ERROR")

    df = spark.readStream.format("rate").option("rowsPerSecond", 3).load()

    resultDF = df.withColumn("newValue", col("value") + 1)

    query = resultDF\
        .writeStream\
        .outputMode('append')\
        .option("truncate", False)\
        .format('console')\
        .start()

    query.awaitTermination()
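# Hedged aside on the snippet above: the "rate" source emits two columns, `timestamp` and a
# monotonically increasing `value`. A processing-time trigger could be added to the writer,
# as sketched here; this variant is illustrative and not part of the original script.
query = resultDF\
    .writeStream\
    .outputMode('append')\
    .format('console')\
    .trigger(processingTime='5 seconds')\
    .start()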
def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True): """ Return a Series containing counts of unique values. The resulting object will be in descending order so that the first element is the most frequently-occurring element. Excludes NA values by default. Parameters ---------- normalize : boolean, default False If True then the object returned will contain the relative frequencies of the unique values. sort : boolean, default True Sort by values. ascending : boolean, default False Sort in ascending order. bins : Not Yet Supported dropna : boolean, default True Don't include counts of NaN. Returns ------- counts : Series See Also -------- Series.count: Number of non-NA elements in a Series. Examples -------- >>> df = ks.DataFrame({'x':[0, 0, 1, 1, 1, np.nan]}) >>> df.x.value_counts() # doctest: +NORMALIZE_WHITESPACE 1.0 3 0.0 2 Name: x, dtype: int64 With `normalize` set to `True`, returns the relative frequency by dividing all values by the sum of values. >>> df.x.value_counts(normalize=True) # doctest: +NORMALIZE_WHITESPACE 1.0 0.6 0.0 0.4 Name: x, dtype: float64 **dropna** With `dropna` set to `False` we can also see NaN index values. >>> df.x.value_counts(dropna=False) # doctest: +NORMALIZE_WHITESPACE 1.0 3 0.0 2 NaN 1 Name: x, dtype: int64 """ if bins is not None: raise NotImplementedError( "value_counts currently does not support bins") if dropna: sdf_dropna = self._kdf._sdf.filter(self.notna()._scol) else: sdf_dropna = self._kdf._sdf sdf = sdf_dropna.groupby(self._scol).count() if sort: if ascending: sdf = sdf.orderBy(F.col('count')) else: sdf = sdf.orderBy(F.col('count').desc()) if normalize: sum = sdf_dropna.count() sdf = sdf.withColumn('count', F.col('count') / F.lit(sum)) index_name = 'index' if self.name != 'index' else 'level_0' kdf = DataFrame(sdf) kdf.columns = [index_name, self.name] kdf._metadata = Metadata(column_fields=[self.name], index_info=[(index_name, None)]) return _col(kdf)
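# Hedged sketch of what value_counts above boils down to in plain PySpark for a single column;
# `sdf` and the column name "x" are illustrative. normalize divides by the non-null row count.
from pyspark.sql import functions as F

non_null = sdf.where(F.col("x").isNotNull())
counts = non_null.groupBy("x").count().orderBy(F.col("count").desc())
normalized = counts.withColumn("count", F.col("count") / F.lit(non_null.count()))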
def final_table(self):
    final_df = self.df_dict('empty_df')
    for module_name in self.config['module_names']:
        if (self.config['module_names'][module_name] == 'Y'):
            try:
                module_df = self.sqlContext.table(
                    self.config_dict['output_db'] + '.' +
                    self.config_dict['output_prefix'] + '_' + module_name +
                    '_' + self.config_dict['output_suffix'])
                module_df.cache()
                module_df.show()
            except Exception as e:
                print('Unable to read table : {}.{}_{}_{}'.format(
                    self.config_dict['output_db'],
                    self.config_dict['output_prefix'], module_name,
                    self.config_dict['output_suffix']))
                print(e)
                # skip modules whose output table could not be read
                continue
            if utils.valid_df(module_df):
                module_df.cache()
                module_df.show()
                common = list(
                    set(module_df.columns).intersection(final_df.columns))
                union = list(
                    set(module_df.columns).union(final_df.columns))
                if len(common) > 0:
                    measures = list(set(union) - set(common))
                    diff_grouping_columns = list(
                        set(measures) - set(self.measures))
                    final_df = module_df.join(
                        final_df, common,
                        'outer').fillna('total', diff_grouping_columns).fillna(
                            '0', self.measures)
                else:
                    union_df = self.sqlContext.createDataFrame(
                        [[''] * len(union)], union).filter(col(union[1]) != '')
                    final_df = functions.union_multi_df(
                        union_df, final_df, module_df, column_sequence_df=1)
                final_df.cache()
                final_df.show()
    if utils.valid_df(final_df):
        final_df.cache()
        final_df.show()
        final_df.registerTempTable('final_df_table')
        self.sqlContext.sql("drop table if exists " +
                            self.config_dict['output_db'] + "." +
                            self.config_dict['output_prefix'] + "_" +
                            self.config_dict['final_table_name'] + "_" +
                            self.config_dict['output_suffix'])
        self.sqlContext.sql("create table " + self.config_dict['output_db'] +
                            "." + self.config_dict['output_prefix'] + "_" +
                            self.config_dict['final_table_name'] + "_" +
                            self.config_dict['output_suffix'] +
                            " as select * from final_df_table")
        # final_df.write.saveAsTable(
        #     self.output_db + '.' + self.output_prefix + '_' + self.final_table_name + '_' + self.output_suffix,
        #     mode = self.write_mode
        # )
    if (self.config_dict['flush_module_tables'] == 'Y'):
        self.flush_module_tables()
    return final_df
StructField('middlename', StringType(), True), StructField('lastname', StringType(), True) ])), StructField('languages', ArrayType(StringType()), True), StructField('state', StringType(), True), StructField('gender', StringType(), True) ]) df = spark.createDataFrame(data = arrayStructureData, schema = arrayStructureSchema) df.printSchema() df.show(truncate=False) df.filter(df.state == "OH") \ .show(truncate=False) df.filter(col("state") == "OH") \ .show(truncate=False) df.filter("gender == 'M'") \ .show(truncate=False) df.filter( (df.state == "OH") & (df.gender == "M") ) \ .show(truncate=False) df.filter(array_contains(df.languages,"Java")) \ .show(truncate=False) df.filter(df.name.lastname == "Williams") \ .show(truncate=False)
# Step 4: Extract Insights About Cat And Dog Owners # Topic extraction with LDA docs_ddf = LDA_dataset_preparation(pet_owners_ddf.limit(20000)). \ persist(StorageLevel.DISK_ONLY) # docs_ddf.show() topics = TopicExtraction(docs_ddf, topic_num=20) topics.process() # Step 5: Identify Creators With Cat And Dog Owners In The Audience creators_ddf = dataset_ddf.join(pet_owners_ddf, 'userid', 'inner'). \ select('creator_name', pet_owners_ddf['userid'], 'predict_dog_owner', 'predict_cat_owner'). \ groupby('creator_name').sum('predict_dog_owner', 'predict_cat_owner'). \ withColumn('dog_count', col('sum(predict_dog_owner)')). \ withColumn('cat_count', col('sum(predict_cat_owner)')). \ selectExpr('creator_name', 'dog_count', 'cat_count', 'dog_count*dog_count AS dog_count2', 'cat_count*cat_count AS cat_count2') dog_m, dog2, cat_m, cat2 = creators_ddf.groupby().avg( 'dog_count', 'dog_count2', 'cat_count', 'cat_count2').collect()[0] ranking_ddf = creators_ddf. \ withColumn('dog_significance', significance_udf(dog_m, dog2)(col('dog_count'))). \ withColumn('cat_significance', significance_udf(cat_m, cat2)(col('cat_count'))). \ persist(StorageLevel.MEMORY_AND_DISK) print('Top 10 creators with higher number of dog owners')
"kafka.bootstrap.servers", "sandbox-hdp.hortonworks.com:9092").option("subscribe", "ratp-api").load() #On vient recuperer la requete type pour recuperer le schema global json_example = spark.read.json("hdfs:///user/root/test.json", multiLine=True) schema_json = json_example.schema #On recupere les colonnes voulues des messages du producer : value correspond a la reponse, et timestamp a l'heure ou a ete effectue la requete df = kafkaStream.selectExpr("CAST(value as STRING)", "CAST(timestamp as TIMESTAMP)") #Selection de la partie du json qui nous interesse df = df.select( F.from_json(F.col("value"), schema_json).alias("test"), "timestamp" ).select( "test.Siri.ServiceDelivery.StopMonitoringDelivery.MonitoredStopVisit", "timestamp").select(F.explode("MonitoredStopVisit"), "timestamp").select( F.explode("col"), "timestamp").select( "col.MonitoredVehicleJourney", "timestamp", "col.MonitoringRef") #On selectionne les colonnes qui nous interessent df = df.select( "MonitoredVehicleJourney.DestinationName", "MonitoredVehicleJourney.LineRef", "MonitoredVehicleJourney.TrainNumbers.TrainNumberRef", "MonitoredVehicleJourney.MonitoredCall.ExpectedArrivalTime", "MonitoredVehicleJourney.MonitoredCall.StopPointName", "timestamp",
def process_log_data(spark, input_data, output_data): """ Extract log data from JSON files and write data to parquet files on S3 :param spark: The Spark session object :param input_data: Source JSON files on S3 :param output_data: Target S3 bucket where to write Parquet files :return: None """ # get filepath to log data file log_data = os.path.join(input_data, 'log_data/*/*/*.json') # # song data needed for join song_data = os.path.join(input_data, 'song_data/*/*/*/*.json') # read log data file df = spark.read.json(log_data) # filter by actions for song plays df = df.filter(df.page == 'NextSong') # extract columns for users table # users - users in the app # user_id, first_name, last_name, gender, level users_table = df.select( 'userId', 'firstName', 'lastName', 'gender', 'level').where(col("userId").isNotNull()).withColumnRenamed( "userId", "user_id").withColumnRenamed("firstName", "first_name").withColumnRenamed( "lastName", "last_name").distinct() # write users table to parquet files users_table.write.parquet(os.path.join(output_data, 'users.parquet'), 'overwrite') # create timestamp column from original timestamp column get_timestamp = udf(lambda x: int(x) / 1000) df = df.withColumn('start_time', get_timestamp('ts')) # create datetime column from original timestamp column get_datetime = udf(lambda x: datetime.fromtimestamp(int(x) / 1000)) df = df.withColumn('datetime', get_datetime('ts')) # extract columns to create time table # time - timestamps of records in songplays broken down into specific units # start_time, hour, day, week, month, year, weekday time_table = df.select('start_time', 'datetime').withColumn( 'hour', hour('datetime')).withColumn('day', dayofmonth('datetime')).withColumn( 'week', weekofyear('datetime')).withColumn( 'month', month('datetime')).withColumn( 'year', year('datetime')).withColumn( 'weekday', dayofweek('datetime')).distinct() # write time table to parquet files partitioned by year and month time_table.write.partitionBy('year', 'month').parquet( os.path.join(output_data, 'time.parquet'), 'overwrite') # read in song data to use for songplays table song_df = spark.read.json(song_data).alias('song_df') # extract columns from joined song and log datasets to create songplays table # songplays - records in log data associated with song plays i.e. records with page NextSong # songplay_id, start_time, user_id, level, song_id, artist_id, # session_id, location, user_agent songplays_table = df.join(song_df, col("artist") == col("song_df.artist_name"), 'inner').select( col('start_time'), col('userId').alias('user_id'), col('level'), col('song_df.song_id').alias('song_id'), col('song_df.artist_id').alias('artist_id'), col('sessionId').alias('session_id'), col('location'), col('userAgent').alias('user_agent'), year('datetime').alias('year'), month('datetime').alias('month')).withColumn( 'songplay_id', monotonically_increasing_id()) # write songplays table to parquet files partitioned by year and month songplays_table.write.partitionBy('year', 'month').parquet( os.path.join(output_data, 'songplays.parquet'), 'overwrite')
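# Hedged alternative sketch for the two timestamp UDFs above: the same columns can usually be
# derived with built-in functions, avoiding Python UDF overhead. `df` and the 'ts' column refer
# to the log DataFrame inside process_log_data; this is illustrative, not part of the function.
from pyspark.sql import functions as F

df = df.withColumn('start_time', F.col('ts') / 1000) \
       .withColumn('datetime', (F.col('ts') / 1000).cast('timestamp'))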
def run_spark_job(spark): # TODO Create Spark Configuration # Create Spark configurations with max offset of 200 per trigger # set up correct bootstrap server and port df = spark \ .readStream \ .format("kafka") \ .option("kafka.bootstrap.servers","localhost:9092") \ .option("subscribe","com.udacity.police.calls") \ .option("startingOffsets","earliest") \ .option("maxRatePerPartition",100) \ .option("maxOffsetsPerTrigger",200) \ .option("stopGracefullyOnShutdown","true") \ .load() # Show schema for the incoming resources for checks df.printSchema() # TODO extract the correct column from the kafka input resources # Take only value and convert it to String kafka_df = df.selectExpr("CAST(value AS STRING)") service_table = kafka_df\ .select(psf.from_json(psf.col('value'), schema).alias("DF"))\ .select("DF.*") # TODO select original_crime_type_name and disposition # distinct_table = service_table \ # .select(psf.to_timestamp(psf.col("call_date_time")).alias("call_date_time"), # psf.col('original_crime_type_name'), # psf.col('disposition')) distinct_table = service_table \ .select(psf.to_timestamp(psf.col("call_date_time")).alias("call_date_time"), psf.col('original_crime_type_name'), psf.col('disposition')) distinct_table.printSchema() # count the number of original crime type agg_df = distinct_table \ .withWatermark("call_date_time", "60 minutes") \ .groupBy( psf.window(distinct_table.call_date_time, "10 minutes", "5 minutes"), psf.col('original_crime_type_name') ) \ .count() # TODO Q1. Submit a screen shot of a batch ingestion of the aggregation # TODO write output stream query = agg_df \ .writeStream \ .outputMode("complete") \ .format("console") \ .start() # TODO attach a ProgressReporter query.awaitTermination() # TODO get the right radio code json path radio_code_json_filepath = "./radio_code.json" radio_code_df = spark.read.json(radio_code_json_filepath) # clean up your data so that the column names match on radio_code_df and agg_df # we will want to join on the disposition code # TODO rename disposition_code column to disposition radio_code_df = radio_code_df.withColumnRenamed("disposition_code", "disposition") # TODO join on disposition column join_query = agg_df.join(radio_code_df, "disposition") join_query.awaitTermination()
# 1. model_path = path to the pre-trained models. (E.g. path/to/model/bigdl_inception-v1_imagenet_0.4.0.model) # # 2. image_path = path to the folder of the training images. (E.g. path/to/data/dogs-vs-cats/demo/\*/\*) model_path = "hdfs:///user/example/dogscats/bigdl_inception-v1_imagenet_0.4.0.model" image_path = "hdfs:///user/example/dogscats/demo/*/*" imageDF = NNImageReader.readImages(image_path, sc) imageDF.printSchema() getName = udf( lambda row: re.search(r'(cat|dog)\.([\d]*)\.jpg', row[0], re.IGNORECASE). group(0), StringType()) getLabel = udf(lambda name: 1.0 if name.startswith('cat') else 2.0, DoubleType()) labelDF = imageDF.withColumn("name", getName(col("image"))).withColumn( "label", getLabel(col('name'))) (trainingDF, validationDF) = labelDF.randomSplit([0.9, 0.1]) labelDF.select("name", "label").show(10) # ## Fine-tune a pre-trained model # We fine-tune a pre-trained model by removing the last few layers, freezing the first few layers, and adding some new layers. transformer = ChainedPreprocessing([ RowToImageFeature(), ImageResize(256, 256), ImageCenterCrop(224, 224), ImageChannelNormalize(123.0, 117.0, 104.0), ImageMatToTensor(), ImageFeatureToTensor() ])
def join_logs(hive_context, batch_config, interval_time_in_seconds, log_table_names): def union_logs(df_clicklog, df_showlog): # union click log and show log. columns = [ 'did', 'is_click', 'action_time', 'keyword', 'keyword_index', 'media', 'media_category', 'net_type', 'gender', 'age', 'adv_id' ] df_clicklog = df_clicklog.withColumn('is_click', lit(1)) df_clicklog = df_clicklog.select(columns) df_showlog = df_showlog.withColumn('is_click', lit(0)) df_showlog = df_showlog.select(columns) df_unionlog = df_showlog.union(df_clicklog) return df_unionlog def transform_action_time(df_logs, interval_time_in_seconds): _udf_time = udf( lambda x: int( datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f').strftime("%s")), IntegerType()) df_logs = df_logs.withColumn('action_time_seconds', _udf_time(col('action_time'))) _udf_interval_time = udf(lambda x: x - x % interval_time_in_seconds, IntegerType()) df_logs = df_logs.withColumn( 'interval_starting_time', _udf_interval_time(col('action_time_seconds'))) return df_logs timer_start = timeit.default_timer() start_date, end_date, load_minutes = batch_config starting_time = datetime.strptime(start_date, "%Y-%m-%d") ending_time = datetime.strptime(end_date, "%Y-%m-%d") showlog_table_name, clicklog_table_name, logs_table_name = log_table_names batched_round = 1 while starting_time < ending_time: batched_time_start_str = starting_time.strftime("%Y-%m-%d %H:%M:%S") batched_time_end = starting_time + \ timedelta(minutes=load_minutes) batched_time_end_str = batched_time_end.strftime("%Y-%m-%d %H:%M:%S") print_batching_info("Main logs", batched_round, batched_time_start_str, batched_time_end_str) command = """select did, action_time, keyword, keyword_index, media, media_category, net_type, gender, age, adv_id from {} where action_time >= '{}' and action_time < '{}'""" df_clicklog_batched = hive_context.sql( command.format(clicklog_table_name, batched_time_start_str, batched_time_end_str)) df_showlog_batched = hive_context.sql( command.format(showlog_table_name, batched_time_start_str, batched_time_end_str)) df_logs_batched = union_logs(df_clicklog_batched, df_showlog_batched) df_logs_batched = transform_action_time(df_logs_batched, interval_time_in_seconds) df_logs_batched = df_logs_batched.withColumn( 'uckey', concat_ws(",", col('media'), col('media_category'), col('net_type'), col('gender'), col('age'))) mode = 'overwrite' if batched_round == 1 else 'append' write_to_table(df_logs_batched, logs_table_name, mode=mode) batched_round += 1 starting_time = batched_time_end timer_end = timeit.default_timer() print('Total batching seconds: ' + str(timer_end - timer_start))
def is_null(df, column): return df.filter(F.col(column).isNull() | F.isnan(column)).count() > 0
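# Hedged companion sketch: calling is_null() once per column launches one Spark job each.
# For numeric columns, null/NaN counts for every column can be gathered in a single pass;
# `df` here is any DataFrame with numeric columns.
from pyspark.sql import functions as F

null_counts = df.select([
    F.count(F.when(F.col(c).isNull() | F.isnan(c), c)).alias(c) for c in df.columns
])
null_counts.show()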
def create_column_with_power(df, column_name: str): return df.withColumn("power_2", power_two_UDF(col(column_name)))
df_mongo = sesh.read.json( 'hdfs://ip-{}.ec2.internal:9000/user/ubuntu/metadata/metadata.json'.format( private_ip)) # drop these columns from metadata df_mongo = df_mongo.drop('id')\ .drop('_id')\ .drop('brand')\ .drop('categories')\ .drop('description')\ .drop('related')\ .drop('salesRank')\ .drop('title')\ .drop('imUrl')\ .dropna()\ .withColumn('price', col('price').cast('float')) # make sure prices are positive df_mongo = df_mongo.where(df_mongo.price > 0) # structure of kindle reviews schema = StructType().add('id', IntegerType(), True)\ .add('asin', StringType(), True)\ .add('helpful', StringType(), True)\ .add('overall', IntegerType(), True)\ .add('reviewText', StringType(), True)\ .add('reviewTime', StringType(), True)\ .add('reviewerID', StringType(), True)\ .add('reviewerName', StringType(), True)\ .add('summary', StringType(), True)\ .add('unixReviewTime', IntegerType(), True)
# 1. Calculate L(orig, priv) and H(orig, priv) for # detailed cells, marginals, total (for the queries listed in "queries") df_L1 = queryLp(querydf, 1) df_L2 = queryLp(querydf, 2) df_Linf = queryLp(querydf, "inf") sdftools.show(df_L1, "L^1 norm for the queries") sdftools.show(df_L2, "L^2 norm for the queries") sdftools.show(df_Linf, "L^inf norm for the queries") df_H = queryHellinger(querydf) sdftools.show(df_H, "Hellinger metric for the queries") # 2. Average L^p and H across geounits in the geolevel # removed AC.GEOCODE from the groupby to aggregate across all geounits groupby = [AC.GEOLEVEL, AC.QUERY, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP] df_L1_avg = df_L1.groupBy(groupby).agg(sf.avg(sf.col("L^1_norm"))).persist() df_L2_avg = df_L2.groupBy(groupby).agg(sf.avg(sf.col("L^2_norm"))).persist() df_Linf_avg = df_Linf.groupBy(groupby).agg(sf.avg(sf.col("L^inf_norm"))).persist() sdftools.show(df_L1_avg, "L^1 norm for the queries") sdftools.show(df_L2_avg, "L^2 norm for the queries") sdftools.show(df_Linf_avg, "L^inf norm for the queries") df_H_avg = df_H.groupBy(groupby).agg(sf.avg(sf.col("H"))).persist() sdftools.show(df_H_avg, "Hellinger metric for the queries")
def read_and_agg_gdelt_data(spark): """ Reads the preprocessed gdelt data of stage 1. Creates aggregations by country and date. Creates several metrics, see Readme.md for details. Merging the subregions is very costly and considered for a later step of the project. spark: spark session returns a spark dataframe with aggregated gdelt data """ #read in gdelt data of stage 1 and create temp view df_gdelt = spark.read.parquet(folder_s1 + "gdelt/gdelt.parquet") df_gdelt.createOrReplaceTempView("gdelt") #create df with covid related metrics, grouped by country and date df_gdelt_facts_covid = spark.sql("""SELECT country_code, `date`, COUNT(DISTINCT GLOBALEVENTID) as gd_events_covid, SUM(NumMentions) as gd_nummentions_covid, SUM(NumSources) as gd_numsources_covid, SUM(NumArticles) as gd_numarticles_covid, AVG(AvgTone) as gd_avgtone_covid, AVG(GoldsteinScale) as gd_gtscale_covid FROM gdelt WHERE covid = true GROUP BY country_code, `date` """) #create df with general metrics, grouped by country and date df_gdelt_facts_general = spark.sql("""SELECT country_code, `date`, COUNT(DISTINCT GLOBALEVENTID) as gd_events_general, SUM(NumMentions) as gd_nummentions_general, SUM(NumSources) as gd_numsources_general, SUM(NumArticles) as gd_numarticles_general, AVG(AvgTone) as gd_avgtone_general, AVG(GoldsteinScale) as gd_gtscale_general FROM gdelt GROUP BY country_code, `date` """) #join the two metric frames together df_gdelt_facts = df_gdelt_facts_general \ .join(df_gdelt_facts_covid, \ on=['country_code','date'],how="outer") #calculate proportions of covid metric on general metric df_gdelt_facts = df_gdelt_facts \ .withColumn('gd_events_covid_perc',\ col('gd_events_covid') / col('gd_events_general')) df_gdelt_facts = df_gdelt_facts \ .withColumn('gd_nummentions_covid_perc',\ col('gd_nummentions_covid') / col('gd_nummentions_general')) df_gdelt_facts = df_gdelt_facts \ .withColumn('gd_numsources_covid_perc',\ col('gd_numsources_covid') / col('gd_numsources_general')) df_gdelt_facts = df_gdelt_facts \ .withColumn('gd_numarticles_covid_perc',\ col('gd_numarticles_covid') / col('gd_numarticles_general')) df_gdelt_facts = df_gdelt_facts.withColumnRenamed('country_code', 'regionId') return df_gdelt_facts
def dedupe_splink_scores( df_e_with_dupes: DataFrame, unique_id_colname: str, score_colname: str = None, selection_fn: str = "abs_val", ): """Sometimes, multiple Splink jobs with different blocking rules are combined into a single dataset of edges. Sometimes,the same pair of nodes will be scored multiple times, once by each job. We need to deduplicate this dataset so each pair of nodes appears only once Args: df_e_with_dupes (DataFrame): Dataframe with dupes unique_id_colname (str): Unique id column name e.g. unique_id score_colname (str, optional): Which column contains scores? If none, inferred from df_e_with_dupes.columns. Defaults to None. selection_fn (str, optional): Where we have several different scores for a given pair of records, how do we decide the final score? Options are 'abs_val' and 'mean'. abs_val: Take the value furthest from 0.5 i.e. the value that expresses most certainty mean: Take the mean of all values Defaults to 'abs_val'. """ # Looking in blocking.py, the position of unique ids # (whether they appear in _l or _r) is guaranteed # in blocking outputs so we don't need to worry about # inversions # This is not the case for labelled data - hence the need # _sql_gen_unique_id_keygen to join labels to df_e possible_vals = ["abs_val", "mean"] if selection_fn not in possible_vals: raise ValueError( f"selection function should be in {possible_vals}, you passed {selection_fn}" ) score_colname = _get_score_colname(df_e_with_dupes, score_colname) if selection_fn == "abs_val": df_e_with_dupes = df_e_with_dupes.withColumn( "absval", f.expr(f"0.5 - abs({score_colname})")) win_spec = Window.partitionBy( [f"{unique_id_colname}_l", f"{unique_id_colname}_r"]).orderBy(f.col("absval").desc()) df_e_with_dupes = df_e_with_dupes.withColumn( "ranking", f.row_number().over(win_spec)) df_e = df_e_with_dupes.filter(f.col("ranking") == 1) df_e = df_e.drop("absval") df_e = df_e.drop("ranking") if selection_fn == "mean": win_spec = Window.partitionBy( [f"{unique_id_colname}_l", f"{unique_id_colname}_r"]).orderBy(f.col(score_colname).desc()) df_e_with_dupes = df_e_with_dupes.withColumn( "ranking", f.row_number().over(win_spec)) df_e_with_dupes = df_e_with_dupes.withColumn( score_colname, f.avg(score_colname).over( win_spec.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)), ) df_e = df_e_with_dupes.filter(f.col("ranking") == 1) df_e = df_e.drop("ranking") return df_e
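# Hedged usage sketch for the helper above: collapse multiply-scored pairs so each pair of ids
# appears once, keeping one score per pair. The column names are illustrative and follow the
# docstring rather than any particular Splink output.
df_e = dedupe_splink_scores(
    df_e_with_dupes,
    unique_id_colname="unique_id",
    score_colname="match_probability",
    selection_fn="abs_val",
)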
import sys import pyspark.sql.functions as f from pyspark.sql import SparkSession spark = SparkSession.builder.appName("task4c-sql").getOrCreate() df = spark.read.format('csv').options(header='false', inferschema='false') \ .load(sys.argv[1]).na.fill('') data = df.select( df._c20.cast('string').alias('name'), df._c5.cast('DECIMAL(10, 2)').alias('fare')) result = data.groupBy('name').agg(f.sum('fare')) \ .select('name', f.col('sum(fare)').alias('revenue')) \ .sort(f.col('revenue').desc()).limit(10) \ .write.csv('task4c-sql.out', quoteAll=False, header=False, quote='', ignoreTrailingWhiteSpace=False) ''' module load python/gnu/3.6.5 module load spark/2.4.0 rm -rf task4c-sql.out hfs -rm -R task4c-sql.out spark-submit --conf \ spark.pyspark.python=/share/apps/python/3.6.5/bin/python \ task4c-sql.py task1b-sql.out hfs -getmerge task4c-sql.out task4c-sql.out hfs -rm -R task4c-sql.out cat task4c-sql.out '''
from pyspark.sql import functions as F @udf("string") def lat_lng_2_h3(lat, lng, res): import h3 try: result = h3.geo_to_h3(lat, lng, res) return result except: return None # invalid coordinates will result in null index value. taxi_trips = taxi_trips.withColumn( "h3_pickup", lat_lng_2_h3(F.col("pickup_latitude"), F.col("pickup_longitude"), F.lit(10))).withColumn( "h3_dropoff", lat_lng_2_h3(F.col("dropoff_latitude"), F.col("dropoff_longitude"), F.lit(10))) # COMMAND ---------- # DBTITLE 1,View that H3 columns display(taxi_trips) # COMMAND ---------- # DBTITLE 1,Define location for data storage (silver layer) username = "******" #please update with a correct user silver_data_location = f"Users/{username}/geospatial/workshop/data/silver"
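# Hedged follow-up sketch: once the H3 index columns exist, pickups can be aggregated per
# hexagon with an ordinary groupBy. `taxi_trips` refers to the DataFrame built above, and
# display() assumes a Databricks notebook, as in the surrounding cells.
pickup_counts = taxi_trips.groupBy("h3_pickup").count().orderBy(F.col("count").desc())
display(pickup_counts)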
# "score":0.0 # }] # } # # (Note: The Redis Source for Kafka has redundant fields zSetEntries and zsetentries, only one should be parsed) # # and create separated fields like this: # +------------+-----+-----------+------------+---------+-----+-----+-----------------+ # | key|value|expiredType|expiredValue|existType| ch| incr| zSetEntries| # +------------+-----+-----------+------------+---------+-----+-----+-----------------+ # |U29ydGVkU2V0| null| null| null| NONE|false|false|[[dGVzdDI=, 0.0]]| # +------------+-----+-----------+------------+---------+-----+-----+-----------------+ # # storing them in a temporary view called RedisSortedSet kafkaRedisDF.withColumn("value", from_json("value", kafkaRedisSchema))\ .select(col('value.existType'), col('value.Ch'),\ col('value.Incr'), col('value.zSetEntries'))\ .createOrReplaceTempView("RedisSortedSet") # TO-DO: execute a sql statement against a temporary view, which statement takes the element field from the 0th element in the array of structs and create a column called encodedCustomer # the reason we do it this way is that the syntax available select against a view is different than a dataframe, and it makes it easy to select the nth element of an array in a sql column zSetEntriesEncodedStreamingDF = spark.sql( "select zSetEntries[0].element as encodedCustomer from RedisSortedSet") # TO-DO: take the encodedCustomer column which is base64 encoded at first like this: # +--------------------+ # | customer| # +--------------------+ # |[7B 22 73 74 61 7...| # +--------------------+
spark = SparkSession.builder.master("local").appName(
    "houseprice").getOrCreate()
sc = spark.sparkContext

train_df = spark.read.csv("/home/luminar/Downloads/train(1).csv",
                          header=True,
                          inferSchema=True)
train_df.show()
train_df.printSchema()
print(train_df.count())

# to find missing values
for c in train_df.columns:
    print(c, train_df.filter(col(c).isNull()).count())

for c in train_df.columns:
    print(c, train_df.filter(col(c) == "NA").count())

# to drop the columns with more than 1000 missing values
for c in train_df.columns:
    if train_df.filter(col(c) == "NA").count() > 1000:
        # drop() returns a new DataFrame, so the result must be reassigned
        train_df = train_df.drop(c)

# MAY 4TH
# fill the null values of columns by using when and otherwise
train_df.groupBy("LotFrontage").count().show()
train_df = train_df.withColumn(
    "LotFrontage",