def inner(df):
    window = Window.partitionBy("Estacao").orderBy("Data").rowsBetween(-2, 2)
    df = df.withColumn("interp", fun.avg(df[col]).over(window))
    df = df.withColumn("interp", fun.round(df["interp"], precision))
    df = df.withColumn(col, fun.coalesce(df[col], df["interp"])).drop("interp")
    return df
def get_log_of_grades(self, df):
    # type: (dataframe) -> dataframe
    for col in self.grade_cols:
        df = df.withColumn(
            col,
            F.coalesce(F.log(F.lit(1) - F.col(col)),
                       F.lit(self.log_value_for_ones)))
    return df
def get_column_spec(self, source_df: Optional[DataFrame],
                    current_column: Optional[Column]) -> Column:
    column_spec = coalesce(*[
        col.get_column_spec(source_df=source_df,
                            current_column=current_column)
        for col in self.value
    ])
    return column_spec
def apply(self, data, on, to=None):
    """Apply entity map"""
    applied = (data.join(self._map,
                         data[on] == self._map[self._from],
                         "left_outer")
               .withColumn(to or on, coalesce(self._map[self._to], data[on]))
               .drop(self._from)
               .drop(self._to))
    return applied
def dataB_gross_price(df: DataFrame) -> DataFrame:
    data_frame = df.withColumn(
        "gross_price",
        F.coalesce(
            F.when((df.gen_ledg == 41000), df.adj_extended_amount).otherwise(0),
            F.lit(MISSING_NUMBER)))
    return data_frame
def transform(inc_df: DataFrame, prev_df: DataFrame) -> DataFrame:
    # calculating the metrics
    inc_df: DataFrame = inc_df.groupBy('email').count(). \
        select(['email',
                col('count').alias('page_view'),
                lit(config['process_date']).alias('last_active')])

    # merging the data with historical records
    df_transformed: DataFrame = inc_df.join(prev_df, inc_df.email == prev_df.email, 'full'). \
        select([coalesce(prev_df.email, inc_df.email).alias('email'),
                (coalesce(prev_df.page_view, lit(0)) +
                 coalesce(inc_df.page_view, lit(0))).alias('page_view'),
                coalesce(prev_df.created_date, inc_df.last_active).cast('date').alias('created_date'),
                coalesce(inc_df.last_active, prev_df.last_active).cast('date').alias('last_active')])
    return df_transformed
def weighted_average(c, window, offsets, weights):
    def value(i):
        return lag(c, -i).over(window)

    values = [coalesce(value(i) * w, lit(0)) for i, w in zip(offsets, weights)]
    return sum(values, lit(0))
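# Hedged usage sketch for weighted_average above (not part of the original
# snippet): the DataFrame, column names and weights are illustrative
# assumptions. lag(c, -i) looks i rows ahead, so offsets [-1, 0, 1] with
# weights [0.25, 0.5, 0.25] give a centered 3-point weighted moving average.
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import lag, coalesce, lit

spark = SparkSession.builder.getOrCreate()
series = spark.createDataFrame(
    [("a", 1, 1.0), ("a", 2, 2.0), ("a", 3, 4.0)], ["group", "ts", "x"])
w = Window.partitionBy("group").orderBy("ts")
smoothed = series.withColumn(
    "x_smoothed",
    weighted_average("x", w, offsets=[-1, 0, 1], weights=[0.25, 0.5, 0.25]))
smoothed.show()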
def any(self, axis: Union[int, str] = 0) -> bool:
    """
    Return whether any element is True.

    Returns False unless there is at least one element within a series that is
    True or equivalent (e.g. non-zero or non-empty).

    Parameters
    ----------
    axis : {0 or 'index'}, default 0
        Indicate which axis or axes should be reduced.

        * 0 / 'index' : reduce the index, return a Series whose index is the
          original column labels.

    Examples
    --------
    >>> ks.Series([False, False]).any()
    False

    >>> ks.Series([True, False]).any()
    True

    >>> ks.Series([0, 0]).any()
    False

    >>> ks.Series([0, 1, 2]).any()
    True

    >>> ks.Series([False, False, None]).any()
    False

    >>> ks.Series([True, False, None]).any()
    True

    >>> ks.Series([]).any()
    False

    >>> ks.Series([np.nan]).any()
    False
    """
    if axis not in [0, 'index']:
        raise ValueError('axis should be either 0 or "index" currently.')

    sdf = self._kdf._sdf.select(self._scol)
    col = self._scol

    # Note that we're ignoring `None`s here for now.

    # any and every was added as of Spark 3.0
    # ret = sdf.select(F.expr("any(CAST(`%s` AS BOOLEAN))" % sdf.columns[0])).collect()[0][0]
    # Here we use max as its alternative:
    ret = sdf.select(F.max(F.coalesce(col.cast('boolean'), F.lit(False)))).collect()[0][0]
    if ret is None:
        return False
    else:
        return ret
def user_preprocessing(spark: SparkSession, save_path="users"):
    """
    Generate the user list and its features. The generated dataframe contains
    the user_id, birth_year, gender, category_subscribe and subscribe columns.

    :param spark: spark Session
    """
    arena_user_data = load_mysql(spark, "arena_user_data")
    arena_category_subscribers = load_mysql(spark, "arena_category_subscribers")
    arena_categories = load_mysql(spark, "arena_categories")
    arena_user_subscribers = load_mysql(spark, "arena_user_subscribers")

    arena_user_data = arena_user_data.select("id", "birth_year", "gender")
    arena_category_subscribers = arena_category_subscribers.select(
        "user_id", "cat_id").where(col("status") == 1)
    arena_categories = arena_categories.select("id", "cat_name")
    arena_user_subscribers = arena_user_subscribers.select(
        "subscriber_id", "user_id").where(col("status") == 1)

    cat_name_subs = arena_category_subscribers.alias("a").join(
        arena_categories.alias("b"),
        on=col("a.cat_id") == col("b.id"),
        how="left").select("a.user_id", "b.cat_name")
    cat_name_subs = cat_name_subs.groupBy("user_id").agg(
        collect_list("cat_name").cast(
            StringType()).alias("category_subscribe"))
    cat_name_subs = cat_name_subs.withColumn(
        "category_subscribe",
        regexp_replace(col("category_subscribe"), r"' ", r"'"))
    cat_name_subs = cat_name_subs.withColumn(
        "category_subscribe",
        regexp_replace(col("category_subscribe"), r"[\[\]\']", r""))
    cat_name_subs = cat_name_subs.withColumn(
        "category_subscribe",
        regexp_replace(col("category_subscribe"), r", ", r","))

    subscriber_list = arena_user_subscribers.groupBy("subscriber_id").agg(
        collect_list("user_id").cast(StringType()).alias("subscribe"))
    subscriber_list = subscriber_list.withColumn(
        "subscribe", regexp_replace(col("subscribe"), r"[\[\]\'\s]", r""))

    users = arena_user_data.alias("a").join(cat_name_subs.alias("b"),
                                            on=col("a.id") == col("b.user_id"),
                                            how="left")\
        .join(subscriber_list.alias("c"),
              on=col("a.id") == col("c.subscriber_id"),
              how="left")\
        .select("a.*", "b.category_subscribe", "c.subscribe").orderBy("id")
    users = users.withColumnRenamed("id", "user_id")
    users = users.withColumn("category_subscribe",
                             coalesce("category_subscribe")).withColumn(
                                 "subscribe", coalesce("subscribe"))
    save_parquet(users, save_path)
def pyspark():
    conf = SparkConf().setAppName("PySparkApp").setMaster("local")
    #conf = SparkConf()
    sc = SparkContext(conf=conf)
    #spark = SparkSession.builder.appName("WordCount").master("local").config(conf = conf).getOrCreate()
    sqlCtx = SQLContext(sc)

    df1 = get_features()
    sdf = sqlCtx.createDataFrame(df1)

    ops1 = "(price_from + price_to)/2"
    data = sdf.withColumn("MedianPrice", expr(ops1))
    tmp = data.withColumn('final_price',
                          coalesce(data['Price123'], data['MedianPrice']))
    finaldata = tmp.drop("price", "disFeature")

    state = {
        "VIC": "Victoria",
        "WA": "Western Australia",
        "ACT": "Australian Capital Territory",
        "NT": "Northern Territory",
        "NSW": "New South Wales",
        "TAS": "Tasmania"
    }
    stateDataP = pd.DataFrame(list(state.items()), columns=["State", "StateName"])
    stateDataD = sqlCtx.createDataFrame(stateDataP)
    data1 = finaldata.join(stateDataD, on=['State'], how='inner')
    finaldataPD = data1.toPandas()
    #dataPD["StateName"].unique()
    sc.stop()

    finaldataPD['price_to'] = finaldataPD['price_to'].astype(str).astype(float)
    finaldataPD['Price123'] = finaldataPD['Price123'].astype(str).astype(float)
    finaldataPD['beds'] = finaldataPD['beds'].astype(str).astype(int)
    finaldataPD['baths'] = finaldataPD['baths'].astype(str).astype(int)
    finaldataPD['parking'] = finaldataPD['parking'].astype(str).astype(int)

    df123 = finaldataPD.copy()
    df123 = df123.replace({pd.np.nan: None})
    #print(df123)
    return df123
def my_concat(*cols):
    """Generate a format that allows importing a Spark df as a one-column txt

    Parameters
    ----------
    *cols : list
        columns

    Returns
    -------
    Spark data_license
        Data in the format needed to save as a txt
    """
    concat_columns = []
    for column in cols[:-1]:
        concat_columns.append(F.coalesce(column, F.lit("*")))
        concat_columns.append(F.lit(" "))
    concat_columns.append(F.coalesce(cols[-1], F.lit("*")))
    return F.concat(*concat_columns)
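# Hedged usage sketch for my_concat above (not part of the original snippet):
# the DataFrame and column names are illustrative assumptions. Nulls are
# rendered as "*" and the columns are joined with single spaces, producing one
# string column that could then be written with DataFrameWriter.text().
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
people = spark.createDataFrame(
    [("1", "Ana", None), ("2", None, "Lima")], ["id", "name", "city"])
one_col = people.select(
    my_concat(F.col("id"), F.col("name"), F.col("city")).alias("value"))
one_col.show(truncate=False)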
def dataB_claims(df: DataFrame) -> DataFrame:
    # Identify the claims value based on the gen_ledg field value
    data_frame = df.withColumn(
        "claims",
        F.coalesce(
            F.when(df.gen_ledg.isin([46000, 46400]),
                   df.adj_extended_amount).otherwise(0),
            F.lit(MISSING_NUMBER)))
    return data_frame
def cleaning_stages(df, **kwargs):
    # cleaning_stages stores stages to clean up the dataset
    # (filter nulls, impute missing data, wrangle data, etc.)
    # TODO: Check for types of df - has to be a Spark DF
    # to_dos = ['drop_cols', 'cast_cols_dtype', 'fill_na', 'impute_cols']
    if 'drop_cols' in kwargs.keys():
        df = df.drop(*kwargs['drop_cols'])

    if 'cast_cols_dtype' in kwargs.keys():
        df = cast_col_to_types(df,
                               kwargs['cast_cols_dtype'][0],
                               to_type=kwargs['cast_cols_dtype'][1])

    if 'fill_na' in kwargs.keys():
        df = df.fillna(kwargs['fill_na'][1], subset=kwargs['fill_na'][0])

    if 'impute_cols' in kwargs.keys():
        for val in kwargs['impute_cols']:
            to_be_imputed_col = val[0]  # val[0] is the name of the column to be imputed
            expr_sentence = val[1]  # val[1] is the expression string to fill nulls with
            col_new_name = "new_" + to_be_imputed_col
            df = df.withColumn(col_new_name, expr(expr_sentence))
            filled = "filled_" + to_be_imputed_col
            df = df.withColumn(
                filled, coalesce(df[to_be_imputed_col], df[col_new_name]))
            # Drop the two intermediate columns, then rename the imputed
            # column back to its original name
            df = df.drop(col_new_name, to_be_imputed_col)
            df = df.withColumnRenamed(filled, to_be_imputed_col)

    if 'rank_cols' in kwargs.keys():
        for val in kwargs['rank_cols']:
            to_be_ranked_col = val[0]  # val[0] is the name of the column to be ranked
            expr_sentence = val[1]  # val[1] is the expression string used to rank
            col_new_name = "ranked_" + to_be_ranked_col
            df = df.withColumn(col_new_name, expr(expr_sentence))

    if 'convert_cols' in kwargs.keys():
        for val in kwargs['convert_cols']:
            to_be_converted_col = val[0]  # name of the column to store converted values
            expr_sentence = val[1]  # expression string to convert the column
            col_new_name = "converted_" + to_be_converted_col
            df = df.withColumn(col_new_name, expr(expr_sentence))

    return df
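# Hedged usage sketch for cleaning_stages above (not from the original source):
# the DataFrame, column names, fill value and expression string are illustrative
# assumptions. 'impute_cols' pairs a target column with a SQL expression whose
# result is coalesced into that column's nulls.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
listings = spark.createDataFrame(
    [("a", "Sydney", 100.0), ("b", None, None)], ["id", "city", "price"])
cleaned = cleaning_stages(
    listings,
    fill_na=(["city"], "unknown"),
    impute_cols=[("price", "avg(price) over ()")],
)
cleaned.show()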
def test_auto_mapper_coalesce(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", None),
            (2, None, "Michael", "1970-02-02"),
            (3, None, "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        my_column=A.coalesce(
            A.column("last_name"), A.column("date_of_birth"), A.text("last_resort")
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["my_column"],
        coalesce(
            col("b.last_name"),
            col("b.date_of_birth"),
            lit("last_resort").cast(StringType()),
        ).alias("my_column"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (
        result_df.where("member_id == 1").select("my_column").collect()[0][0]
        == "Qureshi"
    )
    assert (
        result_df.where("member_id == 2").select("my_column").collect()[0][0]
        == "1970-02-02"
    )
    assert (
        result_df.where("member_id == 3").select("my_column").collect()[0][0]
        == "last_resort"
    )
def test_auto_mapper_array_multiple_items_with_null(
    spark_session: SparkSession,
) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df: DataFrame = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(dst2=AutoMapperList(["address1", "address2", None]))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["dst2"],
        when(
            array(lit("address1"), lit("address2"), lit(None)).isNotNull(),
            filter(
                coalesce(array(lit("address1"), lit("address2"), lit(None)),
                         array()),
                lambda x: x.isNotNull(),
            ),
        ).alias("dst2"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (result_df.where("member_id == 1").select("dst2").collect()[0][0][0]
            == "address1")
    assert (result_df.where("member_id == 1").select("dst2").collect()[0][0][1]
            == "address2")
    assert (result_df.where("member_id == 2").select("dst2").collect()[0][0][0]
            == "address1")
    assert (result_df.where("member_id == 2").select("dst2").collect()[0][0][1]
            == "address2")
def multijoin(dfs, on=None, how=None, coalesce=None):
    """Join multiple dataframes.

    Args:
        dfs (list[pyspark.sql.DataFrame]).
        on: same as ``pyspark.sql.DataFrame.join``.
        how: same as ``pyspark.sql.DataFrame.join``.
        coalesce (list[str]): column names to disambiguate by coalescing
            across the input dataframes. A column must be of the same type
            across all dataframes that define it; if different types appear
            coalesce will do a best-effort attempt in merging them. The
            selected value is the first non-null one in order of appearance
            of the dataframes in the input list. Default is None - don't
            coalesce any ambiguous columns.

    Returns:
        pyspark.sql.DataFrame or None if provided dataframe list is empty.

    Example:
        Assume we have two DataFrames, the first is
        ``first = [{'id': 1, 'value': None}, {'id': 2, 'value': 2}]``
        and the second is
        ``second = [{'id': 1, 'value': 1}, {'id': 2, 'value': 22}]``

        Then collecting the ``DataFrame`` produced by
        ``multijoin([first, second], on='id', how='inner', coalesce=['value'])``
        yields ``[{'id': 1, 'value': 1}, {'id': 2, 'value': 2}]``.
    """
    if not dfs:
        return None

    # Go over the input dataframes and rename each to-be-resolved
    # column to ensure name uniqueness
    coalesce = set(coalesce or [])
    renamed_columns = defaultdict(list)
    for idx, df in enumerate(dfs):
        for col in df.columns:
            if col in coalesce:
                disambiguation = '__{}_{}'.format(idx, col)
                df = df.withColumnRenamed(col, disambiguation)
                renamed_columns[col].append(disambiguation)
        dfs[idx] = df

    # Join the dataframes
    joined_df = reduce(lambda x, y: x.join(y, on=on, how=how), dfs)

    # And coalesce the would-have-been-ambiguities
    for col, disambiguations in renamed_columns.items():
        joined_df = joined_df.withColumn(col, F.coalesce(*disambiguations))
        for disambiguation in disambiguations:
            joined_df = joined_df.drop(disambiguation)

    return joined_df
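# Hedged usage sketch for multijoin above, mirroring its docstring example
# (the SparkSession setup is an assumption, not part of the original snippet).
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
first = spark.createDataFrame([(1, None), (2, 2)], ["id", "value"])
second = spark.createDataFrame([(1, 1), (2, 22)], ["id", "value"])
# For each ambiguous `value`, the first non-null value wins in dataframe order:
# [{'id': 1, 'value': 1}, {'id': 2, 'value': 2}]
multijoin([first, second], on='id', how='inner', coalesce=['value']).show()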
def metricSumDimensionOverWeekPerProfileDay(data,
                                            needed_dimension_variables,
                                            feature_col,
                                            sampling_multiplier,
                                            days=7,
                                            include_day_of_week=False):
    all_user_days = data.select("id").distinct().crossJoin(
        data.select("date").distinct())

    intermediate_table1 = data.filter(col(feature_col) > 0).select(
        ["id", "date", feature_col]).distinct()
    intermediate_table1 = intermediate_table1.alias("intermediate_table")
    all_user_days = all_user_days.alias("all_user_days")

    # Augment activity table to include non-active days
    intermediate_table2 = intermediate_table1.join(
        all_user_days, ['id', 'date'], 'outer').withColumn(
            "n_",
            F.coalesce("intermediate_table." + feature_col,
                       lit(0))).drop(feature_col).withColumnRenamed(
                           "n_", feature_col)

    if include_day_of_week:
        intermediate_table2 = intermediate_table2.withColumn(
            "weekday_" + feature_col,
            F.when(
                F.date_format('date', 'u').cast(IntegerType()) <= 5,
                col(feature_col)).otherwise(0)).withColumn(
                    "weekend_" + feature_col,
                    F.when(
                        F.date_format('date', 'u').cast(IntegerType()) >= 6,
                        col(feature_col)).otherwise(0))

    # Calculate active hours for each profile-day
    windowSpec = Window.partitionBy([intermediate_table2.id]).orderBy(
        intermediate_table2.date).rowsBetween(1 - days, 0)

    active_hours_table = intermediate_table2.withColumn(
        "_temp", F.sum(intermediate_table2[feature_col]).over(windowSpec))

    if include_day_of_week:
        active_hours_table = active_hours_table.withColumn(
            "_temp_weekday",
            F.sum(intermediate_table2["weekday_" + feature_col]).over(
                windowSpec)).withColumn(
                    "_temp_weekend",
                    F.sum(intermediate_table2["weekend_" + feature_col]).over(windowSpec))

    return active_hours_table.drop(
        feature_col,
        "weekday_" + feature_col,
        "weekend_" + feature_col,
    ).withColumnRenamed("_temp", feature_col).withColumnRenamed(
        "_temp_weekday", "weekday_" + feature_col).withColumnRenamed(
            "_temp_weekend", "weekend_" + feature_col)
def dataB_freight_upcharge(df: DataFrame) -> DataFrame:
    data_frame = df.withColumn(
        "freight_upcharge",
        F.coalesce(
            F.when(
                df.charge_desc1.isin({
                    "ADDED FREIGHT", "FREIGHT ALLOWANCE", "FREIGHT CHARGE",
                    "FREIGHT SURCHARGE"
                }), df.adj_extended_amount).otherwise(0.0),
            F.lit(MISSING_NUMBER)))
    return data_frame
def replace(dataframe: DataFrame, column: str,
            replace_dict: Dict[str, str]) -> DataFrame:
    """Replace values of a string column in the dataframe using a dict.

    Example:

    >>> from butterfree.extract.pre_processing import replace
    ... from butterfree.testing.dataframe import (
    ...     assert_dataframe_equality,
    ...     create_df_from_collection,
    ... )
    >>> from pyspark import SparkContext
    >>> from pyspark.sql import session
    >>> spark_context = SparkContext.getOrCreate()
    >>> spark_session = session.SparkSession(spark_context)
    >>> input_data = [
    ...     {"id": 1, "type": "a"}, {"id": 2, "type": "b"}, {"id": 3, "type": "c"}
    ... ]
    >>> input_df = create_df_from_collection(input_data, spark_context, spark_session)
    >>> input_df.collect()
    [Row(id=1, type='a'), Row(id=2, type='b'), Row(id=3, type='c')]
    >>> replace_dict = {"a": "type_a", "b": "type_b"}
    >>> replace(input_df, "type", replace_dict).collect()
    [Row(id=1, type='type_a'), Row(id=2, type='type_b'), Row(id=3, type='c')]

    Args:
        dataframe: data to be transformed.
        column: string column on the dataframe where to apply the replace.
        replace_dict: dict with values to be replaced.
            All mapped values must be string.

    Returns:
        Dataframe with column values replaced.
    """
    if not isinstance(dataframe, DataFrame):
        raise ValueError("dataframe needs to be a Pyspark DataFrame type")
    if (column not in dict(
            dataframe.dtypes)) or (dict(dataframe.dtypes)[column] != "string"):
        raise ValueError(
            "column needs to be the name of a string column in dataframe")
    if (not isinstance(replace_dict, dict)) or (not all(
            isinstance(value, str) for value in chain(*replace_dict.items()))):
        raise ValueError("replace_dict needs to be a Python dict with "
                         "all keys and values as string values")

    mapping = create_map(
        [lit(value) for value in chain(*replace_dict.items())]  # type: ignore
    )
    return dataframe.withColumn(column, coalesce(mapping[col(column)], col(column)))
def add_fiscal_year_and_month_abbr(
        df,
        date_fmt: str = 'yyyy/MM/dd',
        filter_column_year: str = 'voucher_creation_date',
        filter_column_month: str = 'shipment_pickup_date') -> DataFrame:
    expr_mapping = {
        '_fiscal_year': (F.coalesce(
            F.year(F.add_months(F.to_date(filter_column_year, date_fmt), 3)),
            F.year(F.add_months(F.to_date(filter_column_month, date_fmt), 3)))),
        '_month_abbr': (F.coalesce(
            F.upper(
                F.date_format(F.to_date(filter_column_year, date_fmt), 'MMM')),
            F.upper(
                F.date_format(F.to_date(filter_column_month, date_fmt), 'MMM'))))
    }
    select_expr = build_col_expr(expr_mapping)
    transformed = df.select(F.expr('*'), *select_expr)
    return transformed
def combine_key_tables(
    left: DataFrame,
    right: DataFrame
) -> DataFrame:
    return (
        left.join(
            right,
            left[keys[0]] == right[keys[0]],
            how='full'
        ).select(
            *[
                f.coalesce(left[key], right[key]).alias(key)
                for key in keys
            ],
            f.concat(
                f.coalesce(left.key_source, f.array()),
                f.coalesce(right.key_source, f.array())
            ).alias('key_source')
        )
    )
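# Hedged usage sketch for combine_key_tables above (not part of the original
# snippet): it assumes a module-level `keys` list and `key_source` array
# columns, and the names and data below are illustrative.
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

spark = SparkSession.builder.getOrCreate()
keys = ["patient_id", "mrn"]
left_keys = spark.createDataFrame(
    [("p1", "m1", ["ehr"]), ("p2", "m2", ["ehr"])],
    ["patient_id", "mrn", "key_source"])
right_keys = spark.createDataFrame(
    [("p1", None, ["claims"]), ("p3", "m3", ["claims"])],
    ["patient_id", "mrn", "key_source"])
# Each key column keeps the first non-null side; key_source arrays are concatenated.
combine_key_tables(left_keys, right_keys).show(truncate=False)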
def to_date_(col):
    '''Convert multiple date formats from string'''
    formats = [
        'yyyy-M-d',
        'yyyy M d',
        'M/dd/yyyy',
        'yyyy MMM d',
        'M-d-yyyy',
    ]
    return F.coalesce(*[F.to_date(col, f) for f in formats])
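# Hedged usage sketch for to_date_ above (the DataFrame is an illustrative
# assumption): each string is parsed with every listed pattern and coalesce()
# keeps the first one that yields a non-null date. This relies on to_date
# returning NULL for non-matching input, i.e. non-ANSI parsing behavior.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
raw_dates = spark.createDataFrame(
    [("2021-3-7",), ("3/07/2021",), ("2021 Mar 7",)], ["raw"])
raw_dates.withColumn("parsed", to_date_(F.col("raw"))).show()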
def _get_telemetry_sanity_check_metrics(self, enrollments, df):
    """Return aggregations that check for problems with a client."""
    return [
        # Check to see whether the client_id is also enrolled in other branches
        # E.g. indicates cloned profiles. Fraction of such users should be
        # small, and similar between branches.
        F.max(
            F.coalesce((df.experiments[self.experiment_slug] !=
                        enrollments.branch).astype('int'),
                       F.lit(0))).alias('has_contradictory_branch'),

        # Check to see whether the client_id was sending data in the conversion
        # window that wasn't tagged as being part of the experiment. Indicates
        # either a client_id clash, or the client unenrolling. Fraction of such
        # users should be small, and similar between branches.
        F.max(
            F.coalesce(
                (~F.isnull(df.experiments)
                 & F.isnull(df.experiments[self.experiment_slug])).astype('int'),
                F.lit(0))).alias('has_non_enrolled_data'),
    ]
def dataB_msf(df: DataFrame) -> DataFrame:
    df = df.withColumn(
        "msf",
        F.when(((df.sq_ft.isNotNull()) & (df.sq_ft > 0)),
               (df.sq_ft / 1000.0)).otherwise(
                   F.when(((df.lbs.isNotNull()) &
                           (df.dmat_nominal_basis_weight.isNotNull()
                            | (df.dmat_nominal_basis_weight != 0))),
                          df.lbs / df.dmat_nominal_basis_weight).otherwise(
                              F.lit(MISSING_NUMBER))))
    df = df.withColumn('msf', F.coalesce(df.msf, F.lit(MISSING_NUMBER)))
    return df
def run(self, df):
    """
    Process and join the data as desired.

    Key functionalities:
    - Check for erroneous data
    - Support Update/Insert
    - Restartable if the job fails

    Arguments:
        df {Spark.DataFrame} -- Dataframe that is the result of a left join of
            the three disparate data sources for the ticket sales.
    """
    self.logger.info('Data Processing Start // {}'.format(self.etl_id))
    try:
        test_df = df.na.drop()
        test_df.count() == df.count()

        jdbc_options = self._inst_jdbc_params()
        jdbc_options['dbtable'] = 'final_data'
        target_df = self.sqlcontext.read.format('jdbc').options(
            **jdbc_options).load()

        coalesce_cols = [
            column for column in target_df.columns
            if column not in ['transaction_id', 'process_date']
        ]

        param_df = df
        df = df.alias('a').join(
            target_df.alias('b'), ['transaction_id'], how='outer').select(
                'transaction_id',
                *(F.coalesce('b.' + col, 'a.' + col).alias(col)
                  for col in coalesce_cols)).distinct()

        # Compare row counts before and after the merge to report new rows
        insert_row_count = df.count() - param_df.count()
        if insert_row_count > 0:
            self.logger.info(
                'Inserting {} new rows in target dataframe // {}'.format(
                    insert_row_count, self.etl_id))

        self.logger.info('Data Processing Complete // {}'.format(self.etl_id))
        test_df.unpersist()
        return df
    except Exception as e:
        self.logger.error('{} // {}'.format(e, self.etl_id))
def hdfs_join_cms(cms_df):
    """
    Parse the app_open and game_open log contents and join them with the CMS database.
    """
    # hdfs_df
    sql = """
        select t0.custom_uuid,t0.date,t0.package_id,t0.title,t0.site,t0.source
        from sharp.app_open t0 where t0.dt="{date_0}"
        union all
        select t1.custom_uuid,t1.date,t1.package_id,t1.title,t1.site,t1.source
        from sharp.app_open t1 where t1.dt="{date_1}"
        union all
        select t7.custom_uuid,t7.date,t7.package_id,t7.title,t7.site,t7.source
        from sharp.app_open t7 where t7.dt="{date_7}"
    """.format(date_0=str_dt_0, date_1=str_dt_1, date_7=str_dt_7)
    spark.sql("show databases")
    spark.sql("use sharp")
    hdfs_df = spark.sql(sql)

    # Join with the CMS application & game data
    condition_0_1 = (F.coalesce(F.col("t_0.package_id"), F.lit("123")) ==
                     F.coalesce(F.col("t_1.fsk_pid"), F.lit("123")))
    df = hdfs_df.alias("t_0").join(cms_df.alias("t_1"), condition_0_1, "left_outer") \
        .select(F.col("t_0.custom_uuid").alias("custom_uuid"),
                F.col("t_0.date").alias("date"),
                F.col("t_0.site").alias("site"),
                F.col("t_0.package_id").alias("package_id"),
                F.col("t_0.title").alias("title"),
                F.col("t_0.source").alias("source"),
                F.col("t_1.fsk_cid").alias("fsk_cid"))
    return df
def get_column_spec(self, source_df: Optional[DataFrame],
                    current_column: Optional[Column]) -> Column:
    # if column is not of type date then convert it to date
    formats_column_specs: List[Column] = [
        to_timestamp(self.value.get_column_spec(
            source_df=source_df, current_column=current_column),
                     format=format_) for format_ in self.formats
    ] if self.formats else [
        to_timestamp(
            self.value.get_column_spec(source_df=source_df,
                                       current_column=current_column))
    ]

    if source_df is not None and isinstance(self.value, AutoMapperDataTypeColumn) \
            and not dict(source_df.dtypes)[self.value.value] == "timestamp":
        return coalesce(*formats_column_specs)
    elif isinstance(self.value, AutoMapperDataTypeLiteral):
        return coalesce(*formats_column_specs)
    else:
        column_spec = self.value.get_column_spec(
            source_df=source_df, current_column=current_column)
        return column_spec
def test_coalesce(data_gen):
    num_cols = 20
    s1 = gen_scalar(data_gen, force_no_nulls=not isinstance(data_gen, NullGen))
    # we want lots of nulls
    gen = StructGen([('_c' + str(x), data_gen.copy_special_case(None, weight=1000.0))
                     for x in range(0, num_cols)], nullable=False)
    command_args = [f.col('_c' + str(x)) for x in range(0, num_cols)]
    command_args.append(s1)
    data_type = data_gen.data_type
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: gen_df(spark, gen).select(
            f.coalesce(*command_args)))
def lsg_sales(self, prod_list, coupons):
    start_date, end_date = date_period(self.period, self.start_date)

    # Check bound date
    table_name = 'cdwds.lsg_f_sls_invc'
    dt_col_name = 'invc_dt_key'
    _, bound_end_date = date_period(-1, end_date)
    bound_date_check(table_name, dt_col_name, start_date, bound_end_date,
                     self.env, 'YYYYMMDD', 'LSG')

    query = 'SELECT '\
            'UPPER(prod_prc_ref_sku) AS prod_id, sum(ext_net_sls_pmar_amt) AS sales ' \
            'FROM cdwds.lsg_f_sls_invc I ' \
            'LEFT JOIN cdwds.lsg_prod_v P ON P.sku = prod_prc_ref_sku ' \
            f'WHERE invc_dt_key<{start_date} AND invc_dt_key>={end_date} ' \
            'AND UPPER(prod_prc_ref_sku) IS NOT NULL ' \
            "AND P.stk_type_cd <> 'D' " \
            f'GROUP BY UPPER(prod_prc_ref_sku)'

    sales = redshift_cdw_read(query, db_type='RS', database='CDWDS', env=self.env)

    if prod_list:
        print(f'There are {prod_list.count()} products.')
        sales = sales.\
            join(broadcast(prod_list), ['prod_id'], how='inner')
    else:
        print('Product list is not defined for pulling sales.')

    if coupons:
        coupons_count = coupons.select("coupon_key").distinct().count()
        print(f'There are {coupons_count} rows in coupons table.')
        sales = sales. \
            join(broadcast(coupons), ['prod_id'], how='left'). \
            withColumn('coupon', coalesce('coupon', 'prod_id'))
    else:
        print('Coupons is not defined for pulling sales.')

    coupon_sales = sales.groupby('coupon', 'coupon_key').agg({'sales': 'sum'}). \
        withColumnRenamed('sum(sales)', 'coupon_sales'). \
        filter(col('coupon_sales') > 0)

    if sales.count() == 0:
        raise OutputOutOfBoundError(
            'Sales count is 0. Check the data validity of cdwds.lsg_f_sls_invc.'
        )

    if self.debug:
        print(f'Total rows in SKU sales count: {sales.count()}')
        print(f'Total number of coupons with sales: {coupon_sales.count()}')

    return sales, coupon_sales
def test_auto_mapper_date_format(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01 12:30"),
            (2, "Vidal", "Michael", "1970-02-02 06:30"),
        ],
        ["member_id", "last_name", "first_name", "opening_time"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")
    source_df = source_df.withColumn(
        "opening_time", to_timestamp("opening_time", format="yyyy-MM-dd hh:mm"))
    assert dict(source_df.dtypes)["opening_time"] == "timestamp"

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients",
        keys=["member_id"]).columns(openingTime=A.datetime(
            A.column("opening_time")).to_date_format("hh:mm:ss"))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["openingTime"],
        date_format(coalesce(to_timestamp(col("b.opening_time"))),
                    "hh:mm:ss").alias("openingTime"),
    )
    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (result_df.where("member_id == 1").select("openingTime").collect()
            [0][0] == "12:30:00")
    assert (result_df.where("member_id == 2").select("openingTime").collect()
            [0][0] == "06:30:00")

    # check type
    assert dict(result_df.dtypes)["openingTime"] == "string"
cleanDateDF = spark.range(1).select(
    to_date(lit("2017-12-11"), dateFormat).alias("date"),
    to_date(lit("2017-20-12"), dateFormat).alias("date2"))
cleanDateDF.createOrReplaceTempView("dateTable2")


# COMMAND ----------

from pyspark.sql.functions import to_timestamp
cleanDateDF.select(to_timestamp(col("date"), dateFormat)).show()


# COMMAND ----------

from pyspark.sql.functions import coalesce
df.select(coalesce(col("Description"), col("CustomerId"))).show()


# COMMAND ----------

df.na.drop("all", subset=["StockCode", "InvoiceNo"])


# COMMAND ----------

df.na.fill("all", subset=["StockCode", "InvoiceNo"])


# COMMAND ----------

fill_cols_vals = {"StockCode": 5, "Description": "No Value"}
    res['name'] = book
    res['text'] = "\n".join(['<div class="page-break" page="%d">%s</div>' % (r.seq, r.text)
                             for r in pp]) + ('<archiveid tokenizetagcontent="false">%s</archiveid>' % book)
    return Row(**res)

if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Usage: pretty-cluster.py <input> <page-out> <book-out>", file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName="Proteus Pages")
    sqlContext = SQLContext(sc)

    raw = sqlContext.read.load(sys.argv[1])
    cols = set(raw.columns)
    idcols = [col(x) for x in ['identifier', 'issue', 'book'] if x in cols]
    df = raw.withColumn('identifier',
                        regexp_replace(coalesce(*idcols), '[^A-Za-z0-9]+', ''))
    counts = df.groupBy('identifier').count().select(col('identifier'),
                                                     col('count').alias('imagecount'))

    appendID = udf(lambda book, text:
                   '%s <archiveid tokenizetagcontent="false">%s</archiveid>' % (text, book))

    renamed = df.join(counts, 'identifier')\
                .drop('regions')\
                .withColumn('pageNumber', col('seq'))\
                .withColumn('name', concat_ws('_', col('identifier'), col('seq')))\
                .withColumn('text', regexp_replace(col('text'), '\\n', '<br>\\\n'))

    renamed.withColumn('text', appendID(col('identifier'), col('text')))\
           .write.format('json').save(sys.argv[2])

    renamed.rdd.groupBy(lambda r: r.identifier).map(pageCat).toDF()\
def merge(self, df):
    """
    Combines the Data Grid with a given DataFrame. The result is similar to a
    full outer join, except that when there is no match for a given row, the
    row is created anyway with all other columns set to NULL.

    Assuming these schemas:

        DataGrid:  dg[a,b,c]
        DataFrame: df[b,c,d,e]

    this is equivalent to executing the following query after adding the new
    columns to the original DataGrid:

        SELECT a, d, e,
               COALESCE(dg.b, df.b) AS b,
               COALESCE(dg.c, df.c) AS c
        FROM dg
        FULL OUTER JOIN df ON 1=1
            AND dg.b = df.b
            AND dg.c = df.c
    """
    dg = self.dataframe

    # Get information about columns and compute the common and differing
    # column sets between the DataFrame and the DataGrid.
    dg_columns = dg.columns
    df_columns = df.columns
    common_columns = list(set(dg_columns) & set(df_columns))  # intersect

    # Merge the new DataFrame with the current DataGrid
    if not dg_columns:
        # Use the given DataFrame as the default DataGrid
        #self.dataframe = df
        #self.setDataFrame(df)
        dg = df
        #self.index()
    else:
        all_columns = list(set(dg_columns + df_columns))  # union
        new_columns = list(set(df_columns) - set(dg_columns))  # diff
        diff_columns = list(set(all_columns) - set(common_columns))  # diff

        # Merge Columns
        if not common_columns:
            # Add all columns from the given DataFrame that do not exist yet
            # in the DataGrid and initialize them with NULL values.
            common_columns = df_columns
            diff_columns = dg_columns
            for c in new_columns:
                dg = dg.withColumn(c, lit(None).cast(NullType()))

        # Rename DataFrame's columns that are shared with the DataGrid
        condition = []
        for c in common_columns:
            df = df.withColumnRenamed(c, 'df_'+c)
            condition.append(dg[c] == df['df_'+c])

        # Join DataFrames
        dg = dg.join(df, condition, 'outer')
        for c in common_columns:
            dg = dg.withColumn(c, coalesce(c, "df_"+c))
            dg = dg.drop("df_"+c)

        #self.dataframe = dg
        #self.setDataFrame(dg)
        #self.index()

        if config.DEBUG:
            # Debugging: Print out the equivalent SQL Query
            select_stmt = ' SELECT '
            from_stmt = ' FROM dg'
            join_stmt = ' FULL OUTER JOIN df ON 1=1'

            # Append non-common columns to the select statement
            for i in range(0, len(diff_columns)):
                if i != 0:
                    select_stmt += ', '
                select_stmt += diff_columns[i]

            # Append common columns with the COALESCE function to the select
            # statement and add conditions to the join predicate
            for i in range(0, len(common_columns)):
                if i != 0 or len(diff_columns) > 0:
                    select_stmt += ', '
                c = common_columns[i]
                select_stmt += 'COALESCE(dg.'+c+', df.'+c+') AS '+c
                join_stmt += ' AND dg.'+c+'=df.'+c

            query = select_stmt + from_stmt + join_stmt
            print query

            # Column information
            print 'dg_columns: ' + dg_columns.__repr__()
            print 'df_columns: ' + df_columns.__repr__()
            print 'all_columns: ' + all_columns.__repr__()
            print 'new_columns: ' + new_columns.__repr__()
            print 'common_columns: ' + common_columns.__repr__()
            print 'diff_columns: ' + diff_columns.__repr__()

    #self.dataframe.cache()
    return dg