def compile_rpad(t, expr, scope, **kwargs):
    op = expr.op()
    src_column = t.translate(op.arg, scope)
    length = op.length.op().value
    pad = op.pad.op().value
    return F.rpad(src_column, length, pad)
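# A minimal sketch (an addition, not from the original) of the PySpark
# expression this rule compiles down to; the session, DataFrame, and column
# name below are hypothetical.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame([("ab",), ("abcdef",)], ["code"])
# rpad pads on the right up to the target length and truncates past it:
# "ab" -> "ab--", "abcdef" -> "abcd"
demo.select(F.rpad(F.col("code"), 4, "-").alias("padded")).show()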
def _fix_length_of_columns(self):
    if self.complete_with is None:
        complete_with = ' '
    else:
        complete_with = str(self.complete_with)
    df = self.df
    for column_infos in self.column_list:
        if not isinstance(column_infos, (tuple, list)):
            raise TypeError(
                "Values inside 'column_list' must be of type 'list' or "
                "'tuple'. Found: '{0}'".format(type(column_infos)))
        if len(column_infos) != 2:
            raise ValueError(
                "Values inside 'column_list' must have a length of 2 "
                "(column_name and column_length)")
        column_name = column_infos[0]
        column_length = int(column_infos[1])
        df = df.withColumn(
            column_name,
            rpad(col(column_name).cast('string'), column_length,
                 complete_with))
    return df
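# Hedged sketch (an addition) of the same fixed-width padding applied
# directly, assuming a live SparkSession `spark`; the column names and
# lengths are illustrative only.
from pyspark.sql.functions import col, rpad

df = spark.createDataFrame([(1, "NY"), (22, "CA")], ["id", "state"])
for column_name, column_length in [("id", 4), ("state", 3)]:
    df = df.withColumn(
        column_name,
        rpad(col(column_name).cast('string'), column_length, '0'))
df.show()  # id: "1000", "2200"; state: "NY0", "CA0"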
def evaluate_sic_soc_by_class(code, balanced=False, matched_only=True):
    """
    Creates a classification report showing the per-class results of the
    SIC/SOC coding tool. Outputs have been saved on HDFS.

    params:
        code: {'sic', 'soc'}
        balanced: use the validation set with evenly balanced classes,
            bool, default: False
        matched_only: include/exclude records under the matching threshold
    """
    if code == 'sic':
        true_y_label = 'industry_code'
        pred_y_label = 'SIC2007_prototype'
        error_codes_list = ['VVVV', 'XXXX', 'YYYY', 'ZZZZ']
    elif code == 'soc':
        true_y_label = 'SOC2020_code'
        pred_y_label = 'SOC2020_predicted'
        error_codes_list = None
    else:
        raise ValueError("code should be either 'sic' or 'soc'")

    hdfs_filepath = '/dapsen/workspace_zone/sic_soc/dsc/benchmark/' + code
    if balanced:
        hdfs_filepath = hdfs_filepath + '_b'
    benchmark_df = spark_session.read.parquet(hdfs_filepath)

    new_true_y_label = true_y_label + '_2'
    new_pred_y_label = pred_y_label + '_2'
    # right-pad codes to four characters, e.g. '62' becomes '6200'
    benchmark_df = benchmark_df.withColumn(
        new_true_y_label, F.rpad(F.col(true_y_label), 4, '0'))
    if error_codes_list:
        benchmark_df = benchmark_df.withColumn(
            new_true_y_label,
            F.when(F.col(new_true_y_label).isin(error_codes_list),
                   '-1').otherwise(F.col(new_true_y_label)))
    benchmark_df = benchmark_df.withColumn(
        new_pred_y_label,
        F.when(F.col(pred_y_label).isin(['-6', '-1']),
               '-1').otherwise(F.col(pred_y_label)))
    if matched_only:
        benchmark_df = benchmark_df.filter(F.col(new_pred_y_label) != '-1')

    y_true = np.array(benchmark_df.select(new_true_y_label).collect())
    y_pred = np.array(benchmark_df.select(new_pred_y_label).collect())
    return evaluation_results(y_true, y_pred)
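# Hedged usage sketch (an addition): assumes a live spark_session and the
# benchmark parquet files above; evaluation_results is the project's helper.
sic_report = evaluate_sic_soc_by_class('sic')                 # matched records only
soc_report = evaluate_sic_soc_by_class('soc', balanced=True)  # balanced validation set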
    .load('/databricks-datasets/definitive-guide/data/retail-data/by-day/2010-12-01.csv')

# check the data schema
df.printSchema()

# initcap: splits the string on whitespace and uppercases each word's first letter
df.select(initcap(col("Description"))).show(2, False)

# lower // upper
df.select(lower(col("StockCode"))).show(2)

# adding and removing whitespace (lit, ltrim, rtrim, rpad, lpad, trim)
from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
    ltrim(lit(" HELLO ")).alias("ltrim"),
    rtrim(lit(" HELLO ")).alias("rtrim"),
    trim(lit(" HELLO ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lpad"),
    rpad(lit("HELLO"), 10, " ").alias("rpad")).show(2)

## regular expressions
# replace color names in the Description column with the literal "COLOR"
from pyspark.sql.functions import regexp_replace
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
    regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"),
    col("Description")).show(2)

# translate: replace the given characters with other characters
from pyspark.sql.functions import translate
df.select(translate(col("Description"), "WHI", "123")).show(2)

# extract color names
from pyspark.sql.functions import regexp_extract
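# Note (an addition, not from the original): lpad/rpad also truncate when the
# target length is shorter than the input, which is why the lpad column above
# shows "HEL". A self-contained sketch, assuming a fresh session:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, lpad, rpad

spark = SparkSession.builder.getOrCreate()
spark.range(1).select(
    lpad(lit("HELLO"), 3, " ").alias("truncated"),   # "HEL"
    rpad(lit("HELLO"), 10, "*").alias("padded")      # "HELLO*****"
).show()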
from pyspark.sql.functions import lower, upper
df.select(
    col("Description"),
    lower(col("Description")),
    upper(lower(col("Description")))).show(2)

# COMMAND ----------

from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
    ltrim(lit(" HELLO ")).alias("ltrim"),
    rtrim(lit(" HELLO ")).alias("rtrim"),
    trim(lit(" HELLO ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lp"),
    rpad(lit("HELLO"), 10, " ").alias("rp")).show(2)

# COMMAND ----------

from pyspark.sql.functions import regexp_replace
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
    regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"),
    col("Description")).show(2)

# COMMAND ----------

from pyspark.sql.functions import translate
df.select(translate(col("Description"), "LEET", "1337"), col("Description"))\
def collect_orderby(sdf,
                    cols,
                    groupby,
                    orderby=None,
                    suffix='__collect',
                    sep='____',
                    orderby_func={},
                    dtype=StringType(),
                    ascending=True,
                    drop_null=True):
    # Null filling is not handled for now; when orderby is None, nulls are dropped.
    """
    Parameters:
    ----------
    sdf: pyspark dataframe to be processed
    cols: str/list of the sdf cols to be processed
    groupby: str/list of the sdf cols to group by when collecting
    orderby: str/list of the sdf cols to order by when collecting
    suffix: str appended to the collected cols' names (renamed to cols+suffix)
    sep: str separator used by concat_ws (don't change by default)
    dtype: pyspark.sql.types type of the returned values

    Return:
    ----------
    sdf: pyspark dataframe with collect_list ordered by `orderby`

    Example:
    ----------
    sdf = collect_orderby(sdf, cols, groupby='user_id', orderby='time')
    """
    # cols: the columns to collect_list
    # groupby: pass [] when there is nothing to group by
    # orderby: must be string columns (int/float are also supported)
    assert not orderby_func or orderby
    orderby_agg_func = []
    orderby_agg_cols = []
    # used to unify names when a non-string orderby column is also collected
    orderby_copy_cols_dict, orderby_recover_cols_dict = {}, {}
    if not isinstance(cols, list):
        cols = [cols]
    if not isinstance(groupby, list):
        groupby = [groupby]
    if orderby is None:
        orderby = []
        orderby_func = {}
    if not isinstance(orderby, list):
        orderby = [orderby]
    # If an orderby column is also collected and is non-string, copy it first;
    # otherwise the zero-padding below would turn 1 into '01' in the output.
    for i, c in enumerate(orderby):
        if c in cols and dict(sdf.select(orderby).dtypes)[c] != 'string':
            c_orderby = f"{c}{sep}orderby"
            sdf = sdf.withColumn(c_orderby, F.col(c))
            orderby[i] = c_orderby
            orderby_copy_cols_dict[c_orderby] = c
    if not isinstance(orderby_func, dict):
        if not isinstance(orderby_func, list):
            orderby_func = [orderby_func]
        orderby_func = dict(zip(orderby, [orderby_func] * len(orderby)))
    # NOTE: the splitting below breaks if the original values contain sep!
    if not drop_null:
        split_udf = F.udf(
            lambda x: [
                i.split(sep)[-1] if len(i.split(sep)) > 1 else None
                for i in x
            ], ArrayType(dtype))
    else:
        split_udf = F.udf(
            lambda x: [
                i.split(sep)[-1] for i in x if len(i.split(sep)) > 1
            ], ArrayType(dtype))
    # cast non-conforming (array/vector) columns to StringType() first
    for c in [
            k for k, v in sdf.dtypes if k in cols
            and len(re.findall(re.compile(r'^(array|vector)'), v)) > 0
    ]:
        logstr(f'{c} cast to StringType')
        sdf = sdf.withColumn(c, cast2str_udf(c))
    logstr('orderby', orderby)
    if len(orderby) != 0:
        # handle orderby_func
        for c, f_list in orderby_func.items():
            if not isinstance(f_list, list):
                f_list = [f_list]
            for i, f in enumerate(f_list):
                if c not in orderby:
                    continue
                if isinstance(f, str):
                    f = f_list[i] = getattr(F, f)
                key = f"{c}{sep}{f.__name__}"
                orderby_agg_func.append(f(c).alias(key))
                orderby_agg_cols.append(key)
                if c in orderby_copy_cols_dict:
                    orderby_recover_cols_dict[
                        key] = f"{orderby_copy_cols_dict[c]}{sep}{f.__name__}"
        # handle non-string orderby columns
        order_int_list = [
            k for k, v in sdf.dtypes
            if k in orderby and len(re.findall(re.compile(r'(int)'), v)) > 0
        ]
        order_float_list = [
            k for k, v in sdf.dtypes if k in orderby
            and len(re.findall(re.compile(r'(float|double)'), v)) > 0
        ]
        if order_int_list:
            logstr('order_int_list', order_int_list)
            order_int_max_df = sdf.select(order_int_list).agg(
                *[F.max(c).alias(c) for c in order_int_list]).toPandas()
            order_int_max_dict = dict(
                zip(order_int_max_df.keys(),
                    order_int_max_df.values.flatten().tolist()))
            logstr('order_int_max_dict', order_int_max_dict)
            # left-pad ints with '0' to the max width so that string order
            # matches numeric order
            for c in order_int_list:
                sdf = sdf.withColumn(
                    c,
                    F.lpad(
                        F.col(c).cast(StringType()),
                        len(str(order_int_max_dict[c])), '0'))
        if order_float_list:
            logstr('order_float_list', order_float_list)
            for c in order_float_list:
                sdf = sdf.withColumn(c, F.col(c).cast(StringType()))
                max_df = sdf.select(F.split(c, r"\.").alias(c)).select([
                    F.length(F.col(c)[i]).alias(c + f"__{i}")
                    for i in range(2)
                ]).agg(*[
                    F.max(c + f"__{i}").alias(c + f"__{i}")
                    for i in range(2)
                ]).toPandas()
                max_dict = dict(
                    zip(max_df.keys(), max_df.values.flatten().tolist()))
                logstr('max_dict', max_dict)
                # lpad the integer part and rpad the fractional part
                # separately so that string order matches numeric order;
                # padding the whole string (as the code previously did)
                # would truncate it. Casting float/double to string always
                # yields a '.', so both parts exist.
                parts = F.split(F.col(c), r"\.")
                sdf = sdf.withColumn(
                    c,
                    F.concat(
                        F.lpad(parts[0], max_dict[c + "__0"], '0'),
                        F.lit('.'),
                        F.rpad(parts[1], max_dict[c + "__1"], '0')))
        agg_fun_list = [
            F.sort_array(F.collect_list(f"%s{sep}{c}" % '_'.join(orderby)),
                         asc=ascending).alias(c + '_temp') for c in cols
        ]
        # Null handling here is still unfriendly: nulls come out as ['a', , 'b']
        sdf = sdf.select([
            F.concat_ws(sep, *orderby, c).alias(f"%s{sep}{c}" %
                                                '_'.join(orderby))
            for c in cols
        ] + groupby + orderby)
        sdf = sdf.groupBy(groupby).agg(*(agg_fun_list + orderby_agg_func))
        sdf = sdf.select(
            [split_udf(c + '_temp').alias(c + suffix) for c in cols] +
            orderby_agg_cols + groupby)
    else:
        agg_fun_list = [F.collect_list(c).alias(c + '_temp') for c in cols]
        sdf = sdf.select(cols + groupby + orderby)
        sdf = sdf.groupBy(groupby).agg(*(agg_fun_list + orderby_agg_func))
        sdf = sdf.select([
            F.col(c + '_temp').cast(ArrayType(dtype)).alias(c + suffix)
            for c in cols
        ] + orderby_agg_cols + groupby)
    for c1, c2 in orderby_recover_cols_dict.items():
        sdf = sdf.withColumnRenamed(c1, c2)
    return sdf
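# Hedged usage sketch (an addition) for collect_orderby on toy data; assumes
# the helpers above (logstr, cast2str_udf) and a live SparkSession `spark`.
events = spark.createDataFrame(
    [("u1", 2, "b"), ("u1", 1, "a"), ("u2", 1, "c")],
    ["user_id", "time", "page"])
out = collect_orderby(events, "page", groupby="user_id", orderby="time")
out.show()  # page__collect for u1 is ["a", "b"], ordered by time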
""" = SELECT Description, lower(Description), upper(lower(Description)) FROM dfTable """ #3 Trim and Pad functions print("3") df.select( ltrim(lit(" HELLO ")).alias("ltrim"), rtrim(lit(" HELLO ")).alias("rtrim"), trim(lit(" HELLO ")).alias("trim"), lpad(lit("HELLO"), 3, " ").alias("lpad"), rpad(lit("HELLO"), 10, " ").alias("rpad") )\ .show(2) """ SELECT ltrim(" HELLO "), rtrim(" HELLO "), trim(" HELLO "), lpad("HELLO", 3, " "), rpad("HELLO", 3, " ") FROM dfTable """ #4
# approach is that you might have, through your pre-processing data validation
# stage, identified the data types best suited to your dataset. Additionally,
# you can pass in the lists below from various sources, including as
# parameters to your pipeline.
to_int_cols = [
    'WindSpeed', 'WindDirection', 'WindGust', 'Pressure',
    'SignificantWeatherCode'
]
to_long_cols = ['ForecastSiteCode', 'Visibility']
to_date_cols = ['ObservationDate']
to_double_cols = ['ScreenTemperature', 'Latitude', 'Longitude']

# The assumption was that time fields in the weather datasets were of int type
# (hour values) and required formatting to a time format: zero-pad to four
# digits, keep the last two as the hour, extend with zeros to HHMMSS, then
# insert colons, so 7 becomes "07:00:00".
clean_df = clean_df.withColumn(
    'ObservationTime',
    F.lpad(clean_df['ObservationTime'], 4, '0').substr(3, 4))
clean_df = clean_df.withColumn(
    'ObservationTime',
    F.rpad(clean_df['ObservationTime'], 6, '0')).\
    withColumn(
        "ObservationTime",
        F.regexp_replace('ObservationTime', r"(\d\d)", "$1:").substr(0, 8))
# clean_df.select('ObservationTime').distinct().show()

# use Spark's cast to modify the data types
for col in clean_df.columns:
    try:
        if col in to_int_cols:
            clean_df = clean_df.withColumn(col, F.col(col).cast('int'))
        elif col in to_long_cols:
            clean_df = clean_df.withColumn(col, F.col(col).cast('long'))
        elif col in to_date_cols:
            clean_df = clean_df.withColumn(col, F.col(col).cast('date'))
        elif col in to_double_cols:
            clean_df = clean_df.withColumn(col, F.col(col).cast('double'))
        else:
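# A minimal self-contained sketch (an addition; session and data are
# hypothetical) of the time formatting above: integer hours become
# zero-padded HH:MM:SS strings.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
times = spark.createDataFrame([(7,), (23,)], ["ObservationTime"])
times = times.withColumn(
    'ObservationTime', F.lpad(F.col('ObservationTime'), 4, '0').substr(3, 4))
times = times.withColumn(
    'ObservationTime', F.rpad(F.col('ObservationTime'), 6, '0'))
times = times.withColumn(
    'ObservationTime',
    F.regexp_replace('ObservationTime', r"(\d\d)", "$1:").substr(0, 8))
times.show()  # 7 -> "07:00:00", 23 -> "23:00:00"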
df.selectExpr(
    'Description',
    'lower(Description)',
    'upper(lower(Description))').show(2)
# SELECT Description, lower(Description), upper(lower(Description)) FROM dfTable

from pyspark.sql.functions import ltrim, rtrim, rpad, lpad, trim
df.select(
    ltrim(lit(' HELLO ')).alias('ltrim'),
    rtrim(lit(' HELLO ')).alias('rtrim'),
    trim(lit(' HELLO ')).alias('trim'),
    lpad(lit('HELLO'), 3, ' ').alias('lp'),
    rpad(lit('HELLO'), 10, ' ').alias('rp')).show(2)

df.selectExpr(
    'ltrim(" HELLO ") as ltrim',
    'rtrim(" HELLO ") as rtrim',
    'trim(" HELLO ") as trim',
    'lpad("HELLO", 3, " ") as lp',
    'rpad("HELLO", 10, " ") as rp').show(2)
# SELECT
#   ltrim(' HELLO '),
#   rtrim(' HELLO '),
#   trim(' HELLO '),
#   lpad('HELLO', 3, ' '),
#   rpad('HELLO', 10, ' ')
.option("inferSchema", infer_schema) \ .option("header", first_row_is_header) \ .option("sep", delimiter) \ .load(file_location) display(df) # COMMAND ---------- from pyspark.sql import functions as F df = df.withColumn('swap', F.rand(2586) > 0.45) df = df.withColumn( '_first_name', F.when(F.col('swap'), F.rpad(F.substring(F.col('last_name'), 1, 1), 6, '*')).otherwise( F.rpad(F.substring(F.col('first_name'), 1, 1), 6, '*'))) df = df.withColumn( '_last_name', F.when(F.col('swap'), F.rpad(F.substring(F.col('first_name'), 1, 1), 6, '*')).otherwise( F.rpad(F.substring(F.col('last_name'), 1, 1), 6, '*'))) df = df.withColumn('_address', F.sha2(F.col('address'), 256)) # COMMAND ---------- display(df) # COMMAND ---------- # Create a view or table