Example #1
def compile_rpad(t, expr, scope, **kwargs):
    op = expr.op()

    # translate the source string column, then read the literal length and pad values
    src_column = t.translate(op.arg, scope)
    length = op.length.op().value
    pad = op.pad.op().value
    return F.rpad(src_column, length, pad)
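# For reference, a minimal standalone sketch of the F.rpad call produced above
# (assuming a local SparkSession; the DataFrame and column names are illustrative):
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame([("ab",), ("abcd",)], ["code"])
# right-pad every value to 6 characters with '0': 'ab' -> 'ab0000'
demo.select(F.rpad(F.col("code"), 6, "0").alias("code_padded")).show()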
    def _fix_length_of_columns(self):

        if self.complete_with is None:
            complete_with = ' '
        else:
            complete_with = str(self.complete_with)

        df = self.df

        for column_infos in self.column_list:

            if not isinstance(column_infos, (tuple, list)):
                raise TypeError(
                    "Values inside 'column_list' must be of type 'list' or 'tuple'. Found: '{0}'"
                    .format(type(column_infos)))

            if len(column_infos) != 2:
                raise ValueError(
                    "Values inside 'column_list' must have a length of 2 (column_name and column_length)"
                )

            column_name = column_infos[0]
            column_length = int(column_infos[1])

            df = df.withColumn(
                column_name,
                rpad(
                    col(column_name).cast('string'), column_length,
                    complete_with))

        return df
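# A minimal usage sketch of the padding logic above (the wrapper class is assumed; only
# df, column_list and complete_with matter here, and the names are illustrative):
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rpad

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "A"), (23, "BC")], ["id", "grade"])
# pad 'id' to 4 characters and 'grade' to 3, filling with spaces
for column_name, column_length in [("id", 4), ("grade", 3)]:
    df = df.withColumn(
        column_name,
        rpad(col(column_name).cast('string'), column_length, ' '))
df.show()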
def evaluate_sic_soc_by_class(code, balanced=False, matched_only=True):
    """
    Creates a classification report showing the results per class of the sic soc coding tool
    Outputs have been saved on HDFS
    
    params:
    code: {'sic','soc'}
    balanced: Use validation set with evenly balanced classes, bool, default: False
    matched_only: Include/Exclude those under the threshold
    """
    
    if code == 'sic':
        true_y_label = 'industry_code'
        pred_y_label = 'SIC2007_prototype'
        error_codes_list = ['VVVV','XXXX','YYYY','ZZZZ']
    elif code == 'soc':
        true_y_label = 'SOC2020_code'
        pred_y_label = 'SOC2020_predicted'
        error_codes_list = None
    else:
        raise ValueError("code should be either 'sic' or 'soc'")
    hdfs_filepath = '/dapsen/workspace_zone/sic_soc/dsc/benchmark/' + code
    if balanced:
        hdfs_filepath = hdfs_filepath + '_b'
        
    benchmark_df = spark_session.read.parquet(hdfs_filepath)
    
    new_true_y_label = true_y_label + '_2'
    new_pred_y_label = pred_y_label + '_2'
    benchmark_df = benchmark_df.withColumn(new_true_y_label,F.rpad(F.col(true_y_label),4,'0'))
    if error_codes_list:
        benchmark_df = benchmark_df.withColumn(new_true_y_label,F.when(F.col(new_true_y_label).isin(error_codes_list), '-1').otherwise(F.col(new_true_y_label)))
    benchmark_df = benchmark_df.withColumn(new_pred_y_label, F.when(F.col(pred_y_label).isin(['-6','-1']), '-1').otherwise(F.col(pred_y_label)))
    
    if matched_only:
        benchmark_df = benchmark_df.filter(F.col(new_pred_y_label) != '-1')
    y_true = np.array(benchmark_df.select(new_true_y_label).collect())
    y_pred = np.array(benchmark_df.select(new_pred_y_label).collect())
    
    return evaluation_results(y_true, y_pred)
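# evaluation_results is not defined in this snippet; a minimal sketch of what such a
# helper could look like (an assumption, not the original implementation; it uses
# scikit-learn's classification_report to build the per-class report):
import pandas as pd
from sklearn.metrics import classification_report

def evaluation_results(y_true, y_pred):
    # per-class precision/recall/f1, returned as a pandas DataFrame
    report = classification_report(y_true.ravel(), y_pred.ravel(),
                                   output_dict=True, zero_division=0)
    return pd.DataFrame(report).transpose()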
  .load('/databricks-datasets/definitive-guide/data/retail-data/by-day/2010-12-01.csv')
# check the data schema
df.printSchema()

# initcap: splits the string on whitespace and capitalizes the first letter of each word
df.select(initcap(col("Description"))).show(2, False)
# lower / upper
df.select(lower(col("StockCode"))).show(2)
# adding and removing whitespace (lit, ltrim, rtrim, rpad, lpad, trim)
from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
    ltrim(lit("   HELLO   ")).alias("ltrim"),
    rtrim(lit("   HELLO   ")).alias("rtrim"),
    trim(lit("   HELLO   ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lpad"),
    rpad(lit("HELLP"), 10, " ").alias("rpad")).show(2)

## regular expressions
# replace color names in the Description column with the value COLOR
from pyspark.sql.functions import regexp_replace
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
    regexp_replace(col("Description"), regex_string,
                   "COLOR").alias("color_clean"), col("Description")).show(2)

# replace the given characters with other characters
from pyspark.sql.functions import translate
df.select(translate(col("Description"), "WHI", "123")).show(2)

# extract the color name
from pyspark.sql.functions import regexp_extract
from pyspark.sql.functions import lower, upper
df.select(col("Description"),
    lower(col("Description")),
    upper(lower(col("Description")))).show(2)
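# regexp_extract is imported above but never used; a minimal sketch of the color
# extraction it was presumably meant for (the pattern mirrors regex_string above;
# this is an assumption, not the original notebook cell):
extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
df.select(
    regexp_extract(col("Description"), extract_str, 1).alias("color_clean"),
    col("Description")).show(2)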


# COMMAND ----------

from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
    ltrim(lit("    HELLO    ")).alias("ltrim"),
    rtrim(lit("    HELLO    ")).alias("rtrim"),
    trim(lit("    HELLO    ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lp"),
    rpad(lit("HELLO"), 10, " ").alias("rp")).show(2)


# COMMAND ----------

from pyspark.sql.functions import regexp_replace
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
  regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"),
  col("Description")).show(2)


# COMMAND ----------

from pyspark.sql.functions import translate
df.select(translate(col("Description"), "LEET", "1337"),col("Description"))\
Example #6
def collect_orderby(sdf,
                    cols,
                    groupby,
                    orderby=None,
                    suffix='__collect',
                    sep='____',
                    orderby_func={},
                    dtype=StringType(),
                    ascending=True,
                    drop_null=True):
    # Null filling is not handled for now; when orderby=None, null values are dropped
    """
    Paramter:
    ----------
    sdf: pyspark dataframe to be processed
    cols: str/list of the sdf'cols to be processed
    groupby: str/list of sdf' cols to be groupbyed when collect_orderby
    orderby: str/list of sdf' cols to be orderbyed when collect_orderby
    suffix: str of cols' names converted bycollect_orderby(renamed by cols+suffix)
    sep: str of the sep when concat_ws(don't change by default)
    dtype: pyspark.sql.types of the return values
    Return:
    ----------
    sdf: pyspark dataframe of collect_list orderby
    Example:
    ----------
    sdf=collect_orderby(sdf,cols,groupby='user_id',orderby='time')
    """
    # cols: columns to run collect_list on
    # groupby: pass [] when there is nothing to group by
    # orderby: must be string/int/float columns
    assert not orderby_func or orderby
    orderby_agg_func = []
    orderby_agg_cols = []
    orderby_copy_cols_dict, orderby_recover_cols_dict = {}, {}  # used to keep names consistent when a non-string orderby column is also collected
    if not isinstance(cols, list):
        cols = [cols]
    if not isinstance(groupby, list):
        groupby = [groupby]
    if orderby is None:
        orderby = []
        orderby_func = {}
    if not isinstance(orderby, list):
        orderby = [orderby]
    # if an orderby column is also collected and is not a string, copy it first; otherwise a value like 1 would turn into '01'
    for i, c in enumerate(orderby):
        if c in cols and dict(sdf.select(orderby).dtypes)[c] != 'string':
            c_orderby = f"{c}{sep}orderby"
            sdf = sdf.withColumn(c_orderby, F.col(c))
            orderby[i] = c_orderby
            orderby_copy_cols_dict[c_orderby] = c
    if not isinstance(orderby_func, dict):
        if not isinstance(orderby_func, list):
            orderby_func = [orderby_func]
        orderby_func = dict(zip(orderby, [orderby_func] * len(orderby)))
    if not drop_null:
        split_udf = F.udf(
            lambda x: [
                i.split(sep)[-1]
                if len(i.split(sep)) > 1 else None  # this breaks if the original value contains sep!!!!
                for i in x
            ],
            ArrayType(dtype))
    else:
        split_udf = F.udf(
            lambda x: [
                i.split(sep)[-1]  # this breaks if the original value contains sep!!!!
                for i in x if len(i.split(sep)) > 1
            ],
            ArrayType(dtype))
    for c in [
            k for k, v in sdf.dtypes if k in cols
            and len(re.findall(re.compile(r'^(array|vector)'), v)) > 0
    ]:
        logstr(f'{c} cast to StringType')
        sdf = sdf.withColumn(c, cast2str_udf(c))  # cast unsupported types to StringType() first
    logstr('orderby', orderby)
    if len(orderby) != 0:
        # handle orderby_func
        for c, f_list in orderby_func.items():
            if not isinstance(f_list, list):
                f_list = [f_list]
            for i, f in enumerate(f_list):
                if c not in orderby:
                    continue
                if isinstance(f, str):
                    f = f_list[i] = eval(f"F.{f}")
                key = f"{c}{sep}{f.__name__}"
                orderby_agg_func.append(f(c).alias(key))
                orderby_agg_cols.append(key)
                if c in orderby_copy_cols_dict:
                    orderby_recover_cols_dict[
                        key] = f"{orderby_copy_cols_dict[c]}{sep}{f.__name__}"
        # handle non-string orderby columns
        order_int_list = [
            k for k, v in sdf.dtypes
            if k in orderby and len(re.findall(re.compile(r'(int)'), v)) > 0
        ]
        order_float_list = [
            k for k, v in sdf.dtypes if k in orderby
            and len(re.findall(re.compile(r'(float|double)'), v)) > 0
        ]
        if order_int_list:
            logstr('order_int_list', order_int_list)
            # compute the max of each int orderby column once (used to size the zero-padding)
            order_int_max_df = sdf.select(order_int_list).agg(
                *[F.max(c).alias(c) for c in order_int_list]).toPandas()
            order_int_max_dict = dict(
                zip(order_int_max_df.keys(),
                    order_int_max_df.values.flatten().tolist()))
            logstr('order_int_max_dict', order_int_max_dict)
            for c in order_int_list:
                sdf = sdf.withColumn(
                    c,
                    F.lpad(
                        F.col(c).cast(StringType()),
                        len(str(order_int_max_dict[c])), '0'))
        if order_float_list:
            logstr('order_float_list', order_float_list)
            for c in order_float_list:
                sdf = sdf.withColumn(c, F.col(c).cast(StringType()))
                max_df = sdf.select(F.split(c, r"\.").alias(c)).select([
                    F.length(F.col(c)[i]).alias(c + f"__{i}") for i in range(2)
                ]).agg(*[
                    F.max(c + f"__{i}").alias(c + f"__{i}") for i in range(2)
                ]).toPandas()
                max_dict = dict(
                    zip(max_df.keys(),
                        max_df.values.flatten().tolist()))
                logstr('max_dict', max_dict)
                sdf = sdf.withColumn(
                    c,
                    F.lpad(
                        F.col(c).cast(StringType()), max_dict[c + "__0"],
                        '0')).withColumn(
                            c,
                            F.rpad(
                                F.col(c).cast(StringType()),
                                max_dict[c + "__1"], '0'))
        agg_fun_list = [
            F.sort_array(F.collect_list(f"%s{sep}{c}" % '_'.join(orderby)),
                         asc=ascending).alias(c + '_temp') for c in cols
        ]
        # null handling is still not ideal here: nulls come through as ['a', , 'b']
        sdf = sdf.select([
            F.concat_ws(sep, *orderby, c).alias(f"%s{sep}{c}" %
                                                '_'.join(orderby))
            for c in cols
        ] + groupby + orderby)
        sdf = sdf.groupBy(groupby).agg(*(agg_fun_list + orderby_agg_func))
        sdf = sdf.select(
            [split_udf(c + '_temp').alias(c + suffix)
             for c in cols] + orderby_agg_cols + groupby)
    else:
        agg_fun_list = [F.collect_list(c).alias(c + '_temp') for c in cols]
        sdf = sdf.select(cols + groupby + orderby)
        sdf = sdf.groupBy(groupby).agg(*(agg_fun_list + orderby_agg_func))
        sdf = sdf.select([
            F.col(c + '_temp').cast(ArrayType(dtype)).alias(c + suffix)
            for c in cols
        ] + orderby_agg_cols + groupby)
    for c1, c2 in orderby_recover_cols_dict.items():
        sdf = sdf.withColumnRenamed(c1, c2)
    return sdf
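# A minimal usage sketch for collect_orderby (assumes a SparkSession and the logstr /
# cast2str_udf helpers referenced above are in scope; column names are illustrative):
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
events = spark.createDataFrame(
    [("u1", 3, "view"), ("u1", 1, "click"), ("u2", 2, "view")],
    ["user_id", "time", "event"])
# collect each user's events ordered by time: u1 -> ['click', 'view']
collected = collect_orderby(events, cols="event", groupby="user_id", orderby="time")
collected.show(truncate=False)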
Example #7
""" =
SELECT 
    Description,
    lower(Description),
    upper(lower(Description))
FROM 
    dfTable
"""

#3 Trim and Pad functions
print("3")
df.select(
    ltrim(lit(" HELLO ")).alias("ltrim"),
    rtrim(lit(" HELLO ")).alias("rtrim"),
    trim(lit(" HELLO ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lpad"),
    rpad(lit("HELLO"), 10, " ").alias("rpad")
)\
.show(2)
"""
SELECT 
    ltrim(" HELLO "),
    rtrim(" HELLO "),
    trim(" HELLO "),
    lpad("HELLO", 3, " "),
    rpad("HELLO", 3, " ")
FROM 
    dfTable
"""

#4
Example #8
# The approach here assumes that, through a pre-processing data-validation stage, you have
# identified the data types best suited to your dataset. Additionally, you can pass in the
# lists below from various sources, including as parameters to your pipeline.
to_int_cols = [
    'WindSpeed', 'WindDirection', 'WindGust', 'Pressure',
    'SignificantWeatherCode'
]
to_long_cols = ['ForecastSiteCode', 'Visibility']
to_date_cols = ['ObservationDate']
to_double_cols = ['ScreenTemperature', 'Latitude', 'Longitude']
# the assumption was that time fields in the weather datasets were of int type, and required formatting to
# a time format
clean_df = clean_df.withColumn(
    'ObservationTime',
    F.lpad(clean_df['ObservationTime'], 4, '0').substr(3, 4))
clean_df = clean_df.withColumn('ObservationTime', F.rpad(clean_df['ObservationTime'], 6, '0')).\
    withColumn("ObservationTime", (F.regexp_replace('ObservationTime',"""(\d\d)""", "$1:")).substr(0,8))

# clean_df.select('ObservationTime').distinct().show()
# using a cast function from spark to modify the data types
for col in clean_df.columns:
    try:
        if col in to_int_cols:
            clean_df = clean_df.withColumn(col, F.col(col).cast('int'))
        elif col in to_long_cols:
            clean_df = clean_df.withColumn(col, F.col(col).cast('long'))
        elif col in to_date_cols:
            clean_df = clean_df.withColumn(col, F.col(col).cast('date'))
        elif col in to_double_cols:
            clean_df = clean_df.withColumn(col, F.col(col).cast('double'))
        else:
Example #9
df.selectExpr(
        'Description',
        'lower(Description)',
        'upper(lower(Description))').show(2)

# select description, lower(Description), upper(lower(Description)) from dfTable


from pyspark.sql.functions import ltrim, rtrim, rpad, lpad, trim

df.select(
        ltrim(lit('         HELLO           ')).alias('ltrim'),
        rtrim(lit('         HELLO           ')).alias('rtrim'),
        trim(lit('         HELLO           ')).alias('trim'),
        lpad(lit('HELLO'), 3, ' ').alias('lp'),
        rpad(lit('HELLO'), 10, ' ').alias('rp')).show(2)

df.selectExpr(
        'ltrim(         "HELLO"           ) as ltrim',
        'rtrim(         "HELLO"           ) as rtrim',
        'trim(         "HELLO"           ) as trim',
        'lpad("HELLO", 3, " ") as lp',
        'rpad("HELLO", 10, " ") as rp').show(2)

# select 
#   ltrim('     HELLO       '),
#   rtrim('     HELLO       '),
#   trim('      HELLO       '),
#   lpad('HELLO', 3, ' '),
#   rpad('HELLO', 10, ' ')
Example #10
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

display(df)

# COMMAND ----------

from pyspark.sql import functions as F
df = df.withColumn('swap', F.rand(2586) > 0.45)

df = df.withColumn(
    '_first_name',
    F.when(F.col('swap'),
           F.rpad(F.substring(F.col('last_name'), 1, 1), 6, '*')).otherwise(
               F.rpad(F.substring(F.col('first_name'), 1, 1), 6, '*')))
df = df.withColumn(
    '_last_name',
    F.when(F.col('swap'),
           F.rpad(F.substring(F.col('first_name'), 1, 1), 6, '*')).otherwise(
               F.rpad(F.substring(F.col('last_name'), 1, 1), 6, '*')))
df = df.withColumn('_address', F.sha2(F.col('address'), 256))
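# What the three masked columns above produce (an illustration with assumed values, not
# original output): with swap = False, a first_name of 'Alice' yields '_first_name' of
# 'A*****' (first character right-padded to 6 with '*'); with swap = True the first
# letters of first_name and last_name are exchanged; 'address' is replaced by its
# SHA-256 hex digest.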

# COMMAND ----------

display(df)

# COMMAND ----------

# Create a view or table