from typing import Any, Dict, List

import pyspark.sql


# CATEGORICAL_COLS is assumed to be a module-level list of categorical column names.
def build_vocabulary(df: pyspark.sql.DataFrame) -> Dict[str, List[Any]]:
    """Collect the distinct values of each categorical column into a sorted vocabulary."""
    vocab = {}
    for col in CATEGORICAL_COLS:
        values = [r[0] for r in df.select(col).distinct().collect()]
        # Use the column's Python type to build a neutral default so None values sort cleanly.
        col_type = type([x for x in values if x is not None][0])
        default_value = col_type()
        vocab[col] = sorted(values, key=lambda x: x or default_value)
    return vocab
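# Illustrative usage sketch (not part of the original module): assumes
# CATEGORICAL_COLS has been set to the two demo columns used below.
def _example_build_vocabulary(spark: pyspark.sql.SparkSession) -> None:
    demo_df = spark.createDataFrame(
        [('red', 'S'), ('blue', None), ('red', 'M')],
        ['colour', 'size'])
    # With CATEGORICAL_COLS = ['colour', 'size'] this prints something like
    # {'colour': ['blue', 'red'], 'size': [None, 'M', 'S']}.
    print(build_vocabulary(demo_df))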
import pyspark.sql
import pyspark.sql.functions as f


# __normalise_fieldname__ and __rename_nested_field__ are assumed helpers: the first
# normalises a single field name, the second rebuilds a (possibly nested) DataType
# with all of its field names normalised.
def normalise_fields_names(df: pyspark.sql.DataFrame, fieldname_normaliser=__normalise_fieldname__):
    """Return a DataFrame whose top-level and nested field names have been normalised."""
    return df.select([
        f.col("`{}`".format(field.name))
        .cast(__rename_nested_field__(field.dataType, fieldname_normaliser))
        .alias(fieldname_normaliser(field.name))
        for field in df.schema.fields
    ])
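# Illustrative usage sketch (not part of the original module): passes an explicit
# normaliser and relies on the assumed __rename_nested_field__ helper being available.
def _example_normalise_fields_names(spark: pyspark.sql.SparkSession) -> None:
    messy_df = spark.createDataFrame([(1, 'a')], ['User Id', 'First-Name'])
    tidy_df = normalise_fields_names(
        messy_df,
        fieldname_normaliser=lambda name: name.lower().replace(' ', '_').replace('-', '_'))
    # Expected column names: ['user_id', 'first_name'].
    tidy_df.printSchema()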
import pyspark as sk


# week_from_row is an assumed helper that maps a flight row to its week key.
def perc_weather_cancellations_per_week(spark: sk.sql.SparkSession, data: sk.sql.DataFrame) -> sk.RDD:
    """Return an RDD of (week, % of that week's flights cancelled for weather, i.e. code 'B')."""
    # Flights that were cancelled (currently not used in the calculation below).
    onlycancelled = data.filter(data['Cancelled'] == 1)
    # Pair every flight with (1, 1 if it was a weather cancellation else 0).
    codeperweek = data.rdd.map(lambda row: (
        week_from_row(row),
        (1, 1 if str(row['CancellationCode']).strip() == 'B' else 0)))
    # Sum flight counts and weather-cancellation counts per week.
    fractioncancelled = codeperweek.reduceByKey(lambda l, r: (l[0] + r[0], l[1] + r[1]))
    return fractioncancelled.mapValues(lambda v: v[1] / v[0] * 100.0).sortByKey()
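# Illustrative usage sketch (not part of the original module): assumes week_from_row
# returns a sortable week key (e.g. a (year, week-of-year) tuple) for each row.
def _example_perc_weather_cancellations_per_week(spark: sk.sql.SparkSession) -> None:
    flights = spark.createDataFrame(
        [(1, 'B'), (1, 'A'), (0, None), (1, 'B')],
        ['Cancelled', 'CancellationCode'])
    # Prints one (week, percentage) pair per week, ordered by week.
    for week, pct in perc_weather_cancellations_per_week(spark, flights).collect():
        print(week, pct)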
import logging

import pyspark.sql

log = logging.getLogger(__name__)


def show_df(df: pyspark.sql.DataFrame, columns: list, rows: int = 10, sample=False, truncate=True):
    """
    Prints rows of the selected columns of a pyspark df

    :param df: pyspark dataframe
    :param columns: list of columns to print
    :param rows: how many rows to print - default 10
    :param sample: should we sample - default False
    :param truncate: truncate output - default True
    :return:
    """
    if sample:
        # Sample just enough of the DataFrame to show the requested number of rows.
        sample_percent = min(rows / df.count(), 1.0)
        log.info(f'sampling percentage: {sample_percent}')
        df.select(columns).sample(False, sample_percent, seed=1).show(rows, truncate=truncate)
    else:
        df.select(columns).show(rows, truncate=truncate)
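# Illustrative usage sketch (not part of the original module).
def _example_show_df(spark: pyspark.sql.SparkSession) -> None:
    people = spark.createDataFrame(
        [('alice', 34), ('bob', 45), ('carol', 29)],
        ['name', 'age'])
    # Show up to 2 rows of the 'name' column, sampling roughly two thirds of the data.
    show_df(people, ['name'], rows=2, sample=True, truncate=False)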
import logging

import pyspark.sql
from pyspark.sql.functions import avg, explode, from_unixtime, month, row_number, year


# Pipeline step of a class that is assumed to provide get_partitions() (a window spec
# for ranking categories) and upsert_database() (persistence of the final result).
def process_inquiries(self, review: pyspark.sql.DataFrame, metadata: pyspark.sql.DataFrame) -> None:
    logging.info("Start pipeline")
    logging.info("Processing")
    # Convert the unix timestamp to a date and split it into month/year columns.
    review_transform_date = review.select(
        'asin', 'overall', 'unixReviewTime').withColumn(
        "unixReviewTime", from_unixtime("unixReviewTime"))
    review_date_decompose = review_transform_date.withColumn(
        "month", month("unixReviewTime")).withColumn("year", year("unixReviewTime"))
    # Flatten the nested categories array; the exploded column is named 'col' by default.
    metadata_flatten_categories = metadata.select(
        'asin', explode('categories')).select('asin', explode('col'))
    join_review_metadata = review_date_decompose.join(
        metadata_flatten_categories, on=['asin'], how='inner')
    # Count reviews per (year, month, category).
    groupby_review_metadata = join_review_metadata.groupBy(
        "year", "month", "col").count().orderBy(
        'year', 'month', 'count', ascending=False).cache()
    # Rank categories within each window returned by get_partitions() and keep the top 5.
    patrions = groupby_review_metadata.withColumn(
        "rank", row_number().over(self.get_partitions())).cache()
    filter_patrions = patrions.filter(patrions.rank <= 5).cache()
    groupby_review_metadata.unpersist()
    result_inner = join_review_metadata.join(
        filter_patrions, on=['year', 'month', 'col'], how='inner')
    patrions.unpersist()
    filter_patrions.unpersist()
    # Average rating per top-5 category per month.
    result_groupby = result_inner.groupBy('year', 'month', 'col').agg(
        avg('overall').alias('rating')).orderBy('year', 'month', ascending=True)
    result_groupby.show()
    logging.info("Finished")
    self.upsert_database(result_groupby, 'mydb', 'myset')
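# Illustrative context sketch (not part of the original module): one plausible shape
# for the enclosing class, showing what get_partitions() and upsert_database() are
# assumed to provide. The window spec and the print-based upsert are hypothetical.
class _ExampleInquiryPipeline:
    def get_partitions(self):
        from pyspark.sql.functions import desc
        from pyspark.sql.window import Window
        # Rank categories within each (year, month) by descending review count.
        return Window.partitionBy('year', 'month').orderBy(desc('count'))

    def upsert_database(self, df: pyspark.sql.DataFrame, db: str, collection: str) -> None:
        # Stand-in for the real persistence layer.
        print(f'would upsert {df.count()} rows into {db}.{collection}')

    def run(self, review: pyspark.sql.DataFrame, metadata: pyspark.sql.DataFrame) -> None:
        # Delegate to the pipeline step defined above.
        process_inquiries(self, review, metadata)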
import pyspark.sql
import pyspark.sql.functions as f


# __normalise_fieldname__ and __get_fields_info__ are assumed helpers: the latter is
# expected to yield, for each leaf field, its backtick-quoted path segments from the
# root of the schema.
def flatten(df: pyspark.sql.DataFrame, fieldname_normaliser=__normalise_fieldname__):
    """Flatten nested columns into top-level columns with normalised, underscore-joined names."""
    cols = []
    for child in __get_fields_info__(df.schema):
        if len(child) > 2:
            # Field nested below an array: wrap each enclosing non-empty segment in a
            # SQL transform() call so array elements are projected down to the leaf.
            ex = "x.{}".format(child[-1])
            for seg in child[-2:0:-1]:
                if seg != '``':
                    ex = "transform(x.{outer}, x -> {inner})".format(outer=seg, inner=ex)
            ex = "transform({outer}, x -> {inner})".format(outer=child[0], inner=ex)
        else:
            # Top-level or singly nested field: a plain dotted reference is enough.
            ex = ".".join(child)
        cols.append(f.expr(ex).alias(
            fieldname_normaliser("_".join(child).replace('`', ''))))
    return df.select(cols)
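# Illustrative usage sketch (not part of the original module): relies on the assumed
# __get_fields_info__ and __normalise_fieldname__ helpers being available alongside flatten.
def _example_flatten(spark: pyspark.sql.SparkSession) -> None:
    nested_df = spark.createDataFrame(
        [(('Oslo', '0150'), [1, 2, 3])],
        'address struct<city:string, zip:string>, scores array<int>')
    # Expect top-level columns such as address_city, address_zip and scores.
    flatten(nested_df).printSchema()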