def generate_age_information(self, dcc_experiment_df: DataFrame, mice_df: DataFrame):
    experiment_df_a = dcc_experiment_df.alias("exp")
    mice_df_a = mice_df.alias("mice")
    dcc_experiment_df = experiment_df_a.join(
        mice_df_a,
        (experiment_df_a["specimenID"] == mice_df_a["_specimenID"])
        & (experiment_df_a["_centreID"] == mice_df_a["_centreID"]),
        "left_outer",
    )
    dcc_experiment_df = self._add_special_dates(dcc_experiment_df)
    dcc_experiment_df = dcc_experiment_df.withColumn(
        "ageInDays",
        datediff(
            when(
                col("_dateOfBloodCollection").isNotNull(),
                col("_dateOfBloodCollection"),
            )
            .when(col("_dateOfSacrifice").isNotNull(), col("_dateOfSacrifice"))
            .otherwise(col("_dateOfExperiment")),
            col("mice._DOB"),
        ),
    )
    dcc_experiment_df = dcc_experiment_df.withColumn(
        "ageInWeeks",
        udf(lambda x: math.floor(x / 7) if x is not None else None, IntegerType())(
            col("ageInDays")
        ),
    )
    return dcc_experiment_df.select("exp.*", "ageInWeeks", "ageInDays")
def process_fact_parking(sensordata_sdf: DataFrame,
                         dim_parkingbay_sdf: DataFrame,
                         dim_location_sdf: DataFrame,
                         dim_st_marker_sdf: DataFrame,
                         load_id, loaded_on):
    """Transform sensordata into fact_parking"""
    dim_date_id = loaded_on.strftime("%Y%m%d")  # %m (month), not %M (minute)
    midnight = loaded_on.replace(hour=0, minute=0, second=0, microsecond=0)
    dim_time_id = (midnight - loaded_on).seconds

    # Build fact
    fact_parking = sensordata_sdf\
        .join(dim_parkingbay_sdf.alias("pb"), "bay_id", "left_outer")\
        .join(dim_location_sdf.alias("l"), ["lat", "lon"], "left_outer")\
        .join(dim_st_marker_sdf.alias("st"), "st_marker_id", "left_outer")\
        .select(
            lit(dim_date_id).alias("dim_date_id"),
            lit(dim_time_id).alias("dim_time_id"),
            when(col("pb.dim_parking_bay_id").isNull(), lit(EMPTY_UUID))
            .otherwise(col("pb.dim_parking_bay_id")).alias("dim_parking_bay_id"),
            when(col("l.dim_location_id").isNull(), lit(EMPTY_UUID))
            .otherwise(col("l.dim_location_id")).alias("dim_location_id"),
            when(col("st.dim_st_marker_id").isNull(), lit(EMPTY_UUID))
            .otherwise(col("st.dim_st_marker_id")).alias("dim_st_marker_id"),
            "status",
            lit(load_id).alias("load_id"),
            lit(loaded_on.isoformat()).cast("timestamp").alias("loaded_on")
        )
    return fact_parking
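The fact and dimension transforms in this listing call two helpers, `EMPTY_UUID` and `uuidUdf()`, whose definitions are not shown here. A minimal sketch of what they are assumed to look like (the exact values and implementation in the original module may differ):

```python
# Hypothetical definitions of the helpers referenced by the transforms in this listing.
import uuid

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Placeholder surrogate key used when a fact row has no matching dimension row.
EMPTY_UUID = "00000000-0000-0000-0000-000000000000"


def uuidUdf():
    """Return a Column that generates a fresh UUID string per row (surrogate key)."""
    return udf(lambda: str(uuid.uuid4()), StringType())()
```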
def resolve_image_record_parameter_association(
        image_record_observation_df: DataFrame,
        simple_observations_df: DataFrame):
    simple_df = simple_observations_df.alias("simple")
    image_df = image_record_observation_df.alias("image").withColumn(
        "parameterAsc",
        explode("image.seriesMediaParameterValue.parameterAssociation"))
    image_vs_simple_parameters_df = image_df.join(
        simple_df,
        (col("simple.experiment_id") == col("image.experiment_id"))
        & (col("simple.parameter_stable_id") == col("parameterAsc._parameterID")),
    )
    image_vs_simple_parameters_df = image_vs_simple_parameters_df.withColumn(
        "paramName", col("simple.parameter_name"))
    image_vs_simple_parameters_df = image_vs_simple_parameters_df.withColumn(
        "paramSeq", lit("0"))
    image_vs_simple_parameters_df = image_vs_simple_parameters_df.withColumn(
        "paramValue",
        when(col("data_point").isNotNull(), col("data_point")).otherwise(
            when(col("category").isNotNull(), col("category")).otherwise(
                col("text_value"))),
    )
    image_vs_simple_parameters_df = image_vs_simple_parameters_df.groupBy(
        col("image.observation_id"), col("image.parameter_stable_id")).agg(
            collect_set("parameterAsc._parameterID").alias("paramIDs"),
            collect_set("paramName").alias("paramNames"),
            collect_set("paramSeq").alias("paramSeqs"),
            collect_set("paramValue").alias("paramValues"),
        )
    image_vs_simple_parameters_df = image_vs_simple_parameters_df.withColumnRenamed(
        "observation_id", "img_observation_id").withColumnRenamed(
            "parameter_stable_id", "img_parameter_stable_id")
    image_vs_simple_parameters_df = image_vs_simple_parameters_df.select(
        "img_observation_id",
        "img_parameter_stable_id",
        "paramIDs",
        "paramNames",
        "paramSeqs",
        "paramValues",
    )
    image_record_observation_df = image_record_observation_df.join(
        image_vs_simple_parameters_df,
        (image_record_observation_df["observation_id"]
         == image_vs_simple_parameters_df["img_observation_id"])
        & (image_record_observation_df["parameter_stable_id"]
           == image_vs_simple_parameters_df["img_parameter_stable_id"]),
        "left_outer",
    )
    image_record_observation_df = (
        image_record_observation_df.withColumnRenamed(
            "paramIDs", "parameter_association_stable_id").withColumnRenamed(
                "paramNames", "parameter_association_name").withColumnRenamed(
                    "paramSeqs", "parameter_association_sequence_id").withColumnRenamed(
                        "paramValues", "parameter_association_value"))
    return image_record_observation_df
def process_dim_location(sensordata_sdf: DataFrame, dim_location: DataFrame,
                         load_id, loaded_on):
    """Transform sensordata into dim_location"""
    # Get distinct rows from the landing data
    sensordata_sdf = sensordata_sdf\
        .select(["lat", "lon"]).distinct()

    # Using a left_outer join,
    # identify rows that exist in the Dimension table but NOT in the landing data
    oldrows_sdf = dim_location.alias("dim")\
        .join(sensordata_sdf, ["lat", "lon"], "left_outer")\
        .where(sensordata_sdf["lat"].isNull() & sensordata_sdf["lon"].isNull())\
        .select(col("dim.*"))

    # Using a left_outer join,
    # identify rows that exist in BOTH the incoming landing data and the Dimension table,
    # and take the values of the incoming landing data. That is, we update existing table values.
    existingrows_sdf = sensordata_sdf.alias("in")\
        .join(dim_location.alias("dim"), ["lat", "lon"], "left_outer")\
        .where(dim_location["lat"].isNotNull() & dim_location["lon"].isNotNull())\
        .select(
            col("dim.dim_location_id"),
            col("in.lat"),
            col("in.lon")
        )

    # Using a left_outer join,
    # identify rows that exist in the landing data but NOT in the Dimension table
    newrows_sdf = sensordata_sdf.alias("in")\
        .join(dim_location, ["lat", "lon"], "left_outer")\
        .where(dim_location["lat"].isNull() & dim_location["lon"].isNull())\
        .select(col("in.*"))

    # Add load_id, loaded_on and dim_location_id
    existingrows_sdf = existingrows_sdf.withColumn("load_id", lit(load_id))\
        .withColumn("loaded_on", lit(loaded_on.isoformat()).cast("timestamp"))
    newrows_sdf = newrows_sdf.withColumn("load_id", lit(load_id))\
        .withColumn("loaded_on", lit(loaded_on.isoformat()).cast("timestamp"))\
        .withColumn("dim_location_id", uuidUdf())

    # Select relevant columns
    relevant_cols = ["dim_location_id", "lat", "lon", "load_id", "loaded_on"]
    oldrows_sdf = oldrows_sdf.select(relevant_cols)
    existingrows_sdf = existingrows_sdf.select(relevant_cols)
    newrows_sdf = newrows_sdf.select(relevant_cols)

    allrows_sdf = oldrows_sdf\
        .union(existingrows_sdf)\
        .union(newrows_sdf)

    return allrows_sdf
def get_history_product(self, old_dataframe: DataFrame, new_dataframe: DataFrame):
    # We expect the same set of columns for both DFs
    col_names = old_dataframe.columns
    # Exclude the column on which the dataframes are joined
    col_names.remove('id')

    # Join DFs
    df_old = old_dataframe.alias("df_old")
    df_new = new_dataframe.alias("df_new")
    joined_df = df_old.join(df_new, on='id', how='outer')

    # Distinguish columns from old and new DF
    old_columns = [f'df_old.{c_name}' for c_name in col_names]
    new_columns = [f'df_new.{c_name}' for c_name in col_names]

    # Prepare expressions for classifying each joined row
    old_columns_null_expr = ' IS NULL AND '.join(old_columns) + ' IS NULL'
    new_columns_null_expr = ' IS NULL AND '.join(new_columns) + ' IS NULL'
    equals_expr = ' AND '.join(
        f'{old_c} = {new_c}' for old_c, new_c in zip(old_columns, new_columns))

    # Add 'meta' column
    meta_col = when(expr(equals_expr), 'not_changed') \
        .when(expr(old_columns_null_expr), 'inserted') \
        .when(expr(new_columns_null_expr), 'deleted') \
        .otherwise('changed')
    joined_df = joined_df.withColumn('meta', meta_col)

    # Create the final set of columns: take the old value only for deleted rows
    final_df = joined_df
    for c_name in col_names:
        c_name_new_column = when(expr('meta = "deleted"'),
                                 expr(f'df_old.{c_name}')).otherwise(
                                     expr(f'df_new.{c_name}'))
        final_df = final_df.withColumn(f'new_{c_name}', c_name_new_column)

    # Drop the temporary columns and return the result.
    # Dropping the aliased columns directly (e.g. result.drop(new_dataframe[c_name]))
    # only works for a single column, so rebuild each column from its new_* counterpart instead.
    result = final_df
    for c_name in col_names:
        result = result.drop(c_name)
        result = result.withColumn(c_name, expr(f'new_{c_name}'))
        result = result.drop(f'new_{c_name}')
    return result.select('id', 'name', 'score', 'meta')
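A small usage sketch of the change-tracking logic above; the holder class name is hypothetical, and the column names `id`, `name`, `score` come from the final `select` in the method:

```python
# Hypothetical usage of get_history_product; `HistoryBuilder` is a placeholder
# for whatever class actually hosts the method above.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
old_df = spark.createDataFrame(
    [(1, "a", 10), (2, "b", 20), (3, "c", 30)], ["id", "name", "score"])
new_df = spark.createDataFrame(
    [(1, "a", 10), (2, "b", 25), (4, "d", 40)], ["id", "name", "score"])

# Expected 'meta' values: id=1 not_changed, id=2 changed, id=3 deleted, id=4 inserted.
HistoryBuilder().get_history_product(old_df, new_df).show()
```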
def transform_with_data_frame(self, df: DataFrame, source_df: DataFrame,
                              keys: List[str]) -> DataFrame:
    # now add on my stuff
    column_spec: Column = self.get_column_spec(source_df=source_df)
    conditions = [col(f'b.{key}') == col(f'a.{key}') for key in keys]
    existing_columns: List[Column] = [
        col('a.' + column_name) for column_name in df.columns
    ]
    result_df: DataFrame = df.alias('a').join(
        source_df.alias('b'), conditions).select(existing_columns + [column_spec])
    return result_df
def transform_imm_city_demographics(imm_df: DataFrame,
                                    airport_detail_df: DataFrame,
                                    demo_graph_df: DataFrame) -> DataFrame:
    """Transform to imm_city_demographics dataframe."""
    t1 = imm_df.alias('t1')
    t2 = airport_detail_df.alias('t2')
    # imm_df join with airport_detail_df to get city column
    imm_city_df = t1.join(
        t2,
        [t1.state_code == t2.state_code, t1.airport_code == t2.airport_code
         ]).select(
             col('t1.cicid').alias('cicid'),
             col('t1.month').alias('month'),
             col('t1.year').alias('year'),
             col('t1.birth_country').alias('birth_country'),
             col('t1.arrival_date').alias('arrival_date'),
             col('t1.departure_date').alias('departure_date'),
             col('t1.birth_year').alias('birth_year'),
             col('t1.gender').alias('gender'),
             col('t2.city').alias('city'),
             col('t2.name').alias('airport_name'),
             col('t1.state_code').alias('state_code'),
             col('t1.airport_code').alias('airport_code'),
             col('t1.visa_code').alias('visa_code'))

    # imm_city_df join with demo_graph_df to get city demographics info
    t1 = imm_city_df.alias('t1')
    t2 = demo_graph_df.alias('t2')
    imm_city_demograph_df = t1.join(t2, t1.city == t2.city).select(
        col('t1.cicid').alias('cicid'),
        col('t1.month').alias('month'),
        col('t1.year').alias('year'),
        col('t1.birth_country').alias('birth_country'),
        col('t1.arrival_date').alias('arrival_date'),
        col('t1.departure_date').alias('departure_date'),
        col('t1.birth_year').alias('birth_year'),
        col('t1.gender').alias('gender'),
        col('t1.city').alias('city'),
        col('t2.median_age').alias('city_median_age'),
        col('t2.male_population').alias('city_male_population'),
        col('t2.female_population').alias('city_female_population'),
        col('t2.total_population').alias('city_total_population'),
        col('t2.veteran_count').alias('city_veteran_count'),
        col('t2.foreign_born').alias('city_foreign_born_count'),
        col('t1.airport_name').alias('airport_name'),
        col('t1.state_code').alias('state_code'),
        col('t1.airport_code').alias('airport_code'),
        col('t1.visa_code').alias('visa_code'))
    return imm_city_demograph_df
def even_repartition(session: SparkSession, df: ps.DataFrame, num: int,
                     cols: List[Any]) -> ps.DataFrame:
    if num == 1:
        return _single_repartition(df)
    if len(cols) == 0:
        if num == 0:
            return df
        rdd = (_zipWithIndex(df.rdd).partitionBy(num, lambda k: k).mapPartitions(_to_rows))
        return session.createDataFrame(rdd, df.schema)
    else:
        keys = df.select(*cols).distinct()
        krdd = _zipWithIndex(keys.rdd, True)
        new_schema = to_spark_schema(
            to_schema(df.schema).extract(cols) + f"{_PARTITION_DUMMY_KEY}:long")
        idx = session.createDataFrame(krdd, new_schema)
        if num <= 0:
            idx = idx.persist()
            num = idx.count()
        idf = (df.alias("df").join(idx.alias("idx"), on=cols,
                                   how="inner").select(_PARTITION_DUMMY_KEY, "df.*"))

        def _to_kv(rows: Iterable[Any]) -> Iterable[Any]:  # pragma: no cover
            for row in rows:
                yield (row[0], row[1:])

        rdd = (idf.rdd.mapPartitions(_to_kv).partitionBy(num, lambda k: k).mapPartitions(_to_rows))
        return session.createDataFrame(rdd, df.schema)
def process_dim_parking_bay(parkingbay_sdf: DataFrame,
                            dim_parkingbay_sdf: DataFrame,
                            load_id, loaded_on):
    """Transform sensordata into dim_parking_bay"""
    # Get distinct rows from the landing data
    parkingbay_sdf = parkingbay_sdf\
        .select([
            "bay_id", "marker_id", "meter_id", "rd_seg_dsc", "rd_seg_id",
            "the_geom"])\
        .distinct()

    # Get new rows to insert through LEFT JOIN with existing rows
    nr_parkingbay_sdf = parkingbay_sdf.alias("pb")\
        .join(dim_parkingbay_sdf, "bay_id", "left_outer")\
        .where(dim_parkingbay_sdf["bay_id"].isNull())\
        .select(col("pb.*"))

    # Add load_id, loaded_on and dim_parking_bay_id
    nr_parkingbay_sdf = nr_parkingbay_sdf.withColumn("load_id", lit(load_id))\
        .withColumn("loaded_on", lit(loaded_on.isoformat()))\
        .withColumn("dim_parking_bay_id", uuidUdf())

    # Select relevant columns
    nr_parkingbay_sdf = nr_parkingbay_sdf.select([
        "dim_parking_bay_id", "bay_id", "marker_id", "meter_id", "rd_seg_dsc",
        "rd_seg_id", "the_geom", "load_id", "loaded_on"
    ])
    return nr_parkingbay_sdf
def process_data_frame(destination_file_path: str, join_column_name: str,
                       partition_column_name: str, spark: SparkSession,
                       renamed_df: DataFrame):
    """
    Process the csv files.

    The function first checks whether the destination already exists. If not, it creates it;
    if it does exist, it only appends the new data. No data is overwritten, so it is safe to
    run this function as many times with the same data as you want.
    """
    destination_data = destination_file_path.split("/")[-1]
    try:
        existing_data = spark.read.parquet(destination_file_path).select(
            F.col(join_column_name)).distinct()
    except Exception:
        # Destination does not exist yet
        existing_data = None

    if existing_data is not None:
        renamed_df = (renamed_df.alias("new")
                      .join(existing_data.alias("cur"),
                            on=[join_column_name],
                            how="left_anti")
                      .select("new.*"))
        print("Processing " + str(renamed_df.count()) + f" new rows for {destination_data}.")
        renamed_df.write.format("parquet").mode("append")\
            .partitionBy(partition_column_name).save(destination_file_path)
    else:
        print(f"Processing complete csv file for {destination_data}, " + str(renamed_df.count()) + " rows.")
        renamed_df.write.format("parquet").mode("overwrite")\
            .partitionBy(partition_column_name).save(destination_file_path)
def find_similar_movies_by_movie_id(
        movie_id: int,
        movie_ratings_df: DataFrame,
        movie_names_df: DataFrame,
        similar_movies_count: int = 10,
        score_threshold: float = 0.97,
        co_occurrence_threshold: int = 50) -> DataFrame:
    # Remove useless columns to shrink dataframe before join.
    movie_ratings_df = movie_ratings_df.select('user_id', 'movie_id', 'rating')
    movies_similarities_df = cal_movies_similarities(movie_ratings_df).cache()
    movies_similarities_df = movies_similarities_df.repartition(100)
    similar_movies_df = movies_similarities_df. \
        filter(((func.col('movie_id_1') == movie_id) | (func.col('movie_id_2') == movie_id))
               & (func.col('co_occurrence_count') > co_occurrence_threshold)
               & (func.col('similarity_score') > score_threshold)). \
        orderBy(func.desc('similarity_score'), func.desc('co_occurrence_count')). \
        limit(similar_movies_count). \
        select(func.when(func.col('movie_id_1') == movie_id,
                         func.col('movie_id_2')).otherwise(func.col('movie_id_1')).alias('movie_id'),
               func.col('similarity_score'),
               func.col('co_occurrence_count')
               )
    similar_movies_names_df = similar_movies_df.alias('sm'). \
        join(movie_names_df.alias('mn'), func.col('mn.id') == func.col('sm.movie_id')). \
        select(func.col('sm.movie_id').alias('movie_id'),
               func.col('mn.name').alias('movie_name'),
               func.col('sm.similarity_score').alias('similarity_score'),
               func.col('sm.co_occurrence_count').alias('co_occurrence_count')). \
        orderBy(func.desc('similarity_score'), func.desc('co_occurrence_count'))
    return similar_movies_names_df
def create_songplays(song_data: DataFrame, log_data: DataFrame) -> DataFrame:
    """
    Process the song and log data and create table "songplays".

    Processing:
    - user ID and timestamp cannot be empty
    - convert unix timestamp into timestamp
    - join the song and log tables (explanation below)
    - add an ID column
    - select the relevant columns and rename them

    Since there is neither a song ID nor an artist ID in the log data, the two
    tables are joined on song title and artist name, which is the best join key
    the log data offers. Song duration could further disambiguate songs that
    share a title and artist, but it is not used here.

    Parameters
    ----------
    song_data : pyspark.sql.dataframe.DataFrame
        Song dataset.
    log_data : pyspark.sql.dataframe.DataFrame
        Log dataset.

    Returns
    -------
    songplays : pyspark.sql.dataframe.DataFrame
    """
    return (log_data.alias('logs').where(
        (F.col('userId').isNotNull()) & (F.col('ts').isNotNull())).withColumn(
            'start_time',
            F.from_unixtime(F.col('ts') / 1000).cast('timestamp')).join(
                song_data.alias('songs'),
                (F.col('logs.song') == F.col('songs.title'))
                & (F.col('logs.artist') == F.col('songs.artist_name')),
                'inner').withColumn(
                    'songplay_id', F.monotonically_increasing_id()).select(
                        'songplay_id', 'start_time',
                        F.year('start_time').alias('year'),
                        F.month('start_time').alias('month'),
                        F.col('logs.userId').alias('user_id'), 'logs.level',
                        'songs.song_id', 'songs.artist_id',
                        F.col('logs.sessionId').alias('session_id'),
                        'logs.location',
                        F.col('logs.userAgent').alias('user_agent')))
def cross_reference_specimens(
    self, specimen_df: DataFrame, colonies_df: DataFrame, entity_type: str
) -> DataFrame:
    """
    Cross-reference Specimen data with colony tracking report information.
    """
    specimen_df = specimen_df.alias("specimen")
    colonies_df = colonies_df.alias("colony")

    # join the specimen information with the colony information
    specimen_df = specimen_df.join(
        colonies_df,
        (specimen_df["_colonyID"] == colonies_df["colony_name"]),
        "left_outer",
    )

    # generate the specimen allelic composition using the colony information
    # override 3i specimen project using the colony phenotyping consortium
    # override the data source and the project info for MGP and MGP Legacy colonies
    specimen_df = (
        specimen_df.transform(self.generate_specimen_allelic_composition)
        .transform(override_europhenome_datasource)
        .transform(self.override_3i_specimen_project)
    )
    specimen_df = specimen_df.withColumn(
        "_productionCentre",
        when(
            col("_productionCentre").isNull(),
            when(
                col("_phenotypingCentre").isNotNull(), col("_phenotypingCentre")
            ).otherwise(col("colony.production_centre")),
        ).otherwise(col("_productionCentre")),
    )
    specimen_df = specimen_df.select(
        "specimen.*",
        "_productionCentre",
        "allelicComposition",
        "colony.phenotyping_consortium",
    )
    if entity_type == "embryo":
        specimen_df = specimen_df.transform(self.add_embryo_life_stage_acc)
    if entity_type == "mouse":
        specimen_df = specimen_df.transform(self.add_mouse_life_stage_acc)
    return specimen_df
def cal_movies_similarities(movie_ratings_df: DataFrame):
    """
    Calculate similarity of movie pairs based on their co-occurrence and the cosine similarity
    of their ratings when watched by the same person.
    :param movie_ratings_df:
    :return:
    """
    # Find all pairs of different movies watched by the same person.
    # func.col('mr1.movie_id') < func.col('mr2.movie_id') avoids duplicated pairs.
    # Parentheses are mandatory for combined conditions (e.g. &, |).
    ratings_pairs_df = movie_ratings_df.alias('mr1'). \
        join(movie_ratings_df.alias('mr2'),
             (func.col('mr1.user_id') == func.col('mr2.user_id'))
             & (func.col('mr1.movie_id') < func.col('mr2.movie_id'))). \
        select(
            func.col('mr1.movie_id').alias('movie_id_1'),
            func.col('mr2.movie_id').alias('movie_id_2'),
            func.col('mr1.rating').alias('rating_1'),
            func.col('mr2.rating').alias('rating_2')
        )

    # Calculate the dot product (numerator) and magnitude (denominator) of the cosine similarity equation.
    # Each movie is considered a vector of its ratings.
    ratings_pairs_df = ratings_pairs_df.groupBy('movie_id_1', 'movie_id_2'). \
        agg(func.sum(func.col('rating_1') * func.col('rating_2')).alias('sim_dot_product'),
            (func.sqrt(func.sum(func.pow(func.col('rating_1'), 2)))
             * func.sqrt(func.sum(func.pow(func.col('rating_2'), 2)))).alias('sim_magnitude'),
            func.count(func.col('movie_id_1')).alias('co_occurrence_count')
            )

    # Calculate cosine similarity as a new column:
    # (dot product of the two rating vectors / product of their magnitudes)
    movies_similarities_df = ratings_pairs_df. \
        withColumn('similarity_score',
                   func.when(func.col('sim_magnitude') != 0,
                             func.col('sim_dot_product') / func.col('sim_magnitude')).otherwise(0)
                   ).select('movie_id_1', 'movie_id_2', 'similarity_score', 'co_occurrence_count')

    return movies_similarities_df
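For reference, the cosine similarity computed above for a single movie pair reduces to dot(r1, r2) / (|r1| * |r2|); a tiny pure-Python check with made-up ratings:

```python
# Toy check of the cosine-similarity formula used above (made-up ratings, no Spark needed).
import math

ratings_1 = [5.0, 4.0, 1.0]  # ratings of movie 1 by three shared users
ratings_2 = [4.0, 5.0, 2.0]  # ratings of movie 2 by the same users

dot_product = sum(a * b for a, b in zip(ratings_1, ratings_2))
magnitude = (math.sqrt(sum(a * a for a in ratings_1))
             * math.sqrt(sum(b * b for b in ratings_2)))
similarity = dot_product / magnitude if magnitude != 0 else 0.0
print(round(similarity, 4))  # 0.9661
```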
def find_movie_recommendations_by_user_id(
        user_id: int, ratings_df: DataFrame, movie_names_df: DataFrame,
        spark: SparkSession) -> Union[DataFrame, bool]:
    """
    Use collaborative filtering to recommend new movies based on their similarity.
    Collaborative filtering is based on the assumption that people who agreed in the past
    will agree in the future, and that they will like similar kinds of items as they liked
    in the past.
    https://spark.apache.org/docs/3.0.1/ml-collaborative-filtering.html
    :param user_id:
    :param ratings_df:
    :param movie_names_df:
    :param spark:
    :return:
    """
    als_model = train_als_model_by_ratings(ratings_df)

    # Examples of finding recommendations for the top numItems movies (items) or users:
    # # Generate top 10 movie recommendations for each user
    # userRecs = model.recommendForAllUsers(numItems)
    # # Generate top 10 user recommendations for each movie
    # movieRecs = model.recommendForAllItems(numItems)
    # # Generate top 10 movie recommendations for a specified set of users (in dataframe format).
    # userSubsetRecs = model.recommendForUserSubset(users, numItems)
    # # Generate top 10 user recommendations for a specified set of movies (in dataframe format).
    # movieSubSetRecs = model.recommendForItemSubset(movies, numItems)
    numItems = 10

    # The recommendForUserSubset input must be a dataframe with a single column called "user_id".
    user_schema = types.StructType(
        [types.StructField('user_id', types.IntegerType())])
    users_df = spark.createDataFrame([[
        user_id,
    ]], user_schema)

    # movies_recs_df is a dataframe of Rows with two columns, "user_id" and "recommendations".
    # Each row holds the recommendations for one user in the users_df subset (users_df has only one item here).
    # "recommendations" is stored as an array of (movie_id, rating) Rows.
    # "rating" is the predicted rating that the given user would give to each movie.
    movies_recs_df = als_model.recommendForUserSubset(users_df, numItems)
    if not movies_recs_df.take(1):
        return False

    movie_rating_df = spark.createDataFrame(
        movies_recs_df.collect()[0].recommendations)
    movie_rating_df = movie_rating_df.alias('mr'). \
        join(movie_names_df.alias('mn'), func.col('mr.movie_id') == func.col('mn.id')). \
        select(func.col('mr.movie_id').alias('id'),
               func.col('mn.name').alias('name'),
               func.col('mr.rating').alias('predicted_rating'))
    return movie_rating_df
def override_3i_specimen_data(dcc_specimen_df: DataFrame) -> DataFrame:
    dcc_specimen_df_a = dcc_specimen_df.alias("a")
    dcc_specimen_df_b = dcc_specimen_df.alias("b")
    dcc_specimen_df = dcc_specimen_df_a.join(
        dcc_specimen_df_b,
        (dcc_specimen_df_a["_specimenID"] == dcc_specimen_df_b["_specimenID"])
        & (dcc_specimen_df_a["_centreID"] == dcc_specimen_df_b["_centreID"])
        & (dcc_specimen_df_a["_dataSource"] != dcc_specimen_df_b["_dataSource"]),
        "left_outer",
    )
    dcc_specimen_df = dcc_specimen_df.where(
        col("b._specimenID").isNull()
        | ((col("b._specimenID").isNotNull()) & (col("a._dataSource") != "3i")))
    return dcc_specimen_df.select("a.*")
def transform_with_data_frame(self, df: DataFrame, source_df: Optional[DataFrame],
                              keys: List[str]) -> DataFrame:
    assert source_df
    # now add on my stuff
    column_spec: Column = self.get_column_spec(source_df=source_df)
    conditions = [col(f"b.{key}") == col(f"a.{key}") for key in keys]
    existing_columns: List[Column] = [
        col("a." + column_name) for column_name in df.columns
    ]
    result_df: DataFrame = (df.alias("a").join(
        source_df.alias("b"), conditions).select(existing_columns + [column_spec]))
    return result_df
def update_sum_count_or_insert(self, new_data: DataFrame, table: str, id_col: str):
    try:
        delta_table = DeltaTable.forPath(self.spark_configuration.spark_session,
                                         self.delta_src + table)
    except AnalysisException:
        # If the Delta table does not exist yet, just create it
        new_data.write \
            .format("delta") \
            .save(self.delta_src + table)
        return

    delta_table.alias("current_data").merge(
        new_data.alias("updates"),
        "current_data.{0} = updates.{0}".format(id_col))\
        .whenMatchedUpdate(set={
            "count": "current_data.count + updates.count"
        }) \
        .whenNotMatchedInsertAll() \
        .execute()
def merge(
        self,
        df: DataFrame,
        location: str,
        condition: str,  # Only supports SQL-like string condition
        match_update_dict: dict,  # "target_column": "expression"
        insert_when_not_matched: bool = False,  # Set to True for upsert
        save_mode: str = 'table'):
    '''Merge a dataframe into a target table or path.

    This merge operation can represent both update and upsert operations.
    The source and target tables are aliased as 'SRC' and 'TGT' by default;
    these aliases can be used in the condition string and update/insert expressions.

    Args:
        df (DataFrame): The source dataframe to write.
        save_mode (str): 'table' or 'path'
        location (str): The table name or path to merge into.
        condition (str): The condition in SQL-like string form.
        match_update_dict (dict): Contains ("target_column": "expression").
            This represents the updated value if matched, and is also used as the
            inserted value when insert_when_not_matched is True. Other columns
            which are not specified shall be null.
            NOTE: "target_column"s come without schema ("SRC" or "TGT").
    '''
    super(DeltaDataSource, self).merge(df,
                                       condition,
                                       match_update_dict,
                                       insert_when_not_matched=insert_when_not_matched)
    save_mode = save_mode.lower()
    if save_mode == "table":
        target_table = DeltaTable.forName(self.spark, location)
    elif save_mode == "path":
        target_table = DeltaTable.forPath(self.spark, location)
    else:
        raise ValueError("save_mode should be 'path' or 'table'.")

    merger = target_table.alias("TGT").merge(df.alias("SRC"), condition)
    merger = merger.whenMatchedUpdate(set=match_update_dict)
    if insert_when_not_matched:
        merger = merger.whenNotMatchedInsert(values=match_update_dict)
    merger.execute()
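A hedged usage sketch of the merge method above; the table name, columns, and the way the `DeltaDataSource` instance and `spark` session are obtained are placeholders rather than anything from the original code:

```python
# Hypothetical call illustrating the merge() API above. The condition and the
# update expressions use the SRC/TGT aliases set up inside merge().
updates_df = spark.table("staging.customers")   # assumed source dataframe
delta_source = DeltaDataSource(spark)           # assumed constructor

delta_source.merge(
    df=updates_df,
    location="analytics.dim_customers",
    condition="SRC.customer_id = TGT.customer_id",
    match_update_dict={"name": "SRC.name", "updated_at": "SRC.updated_at"},
    insert_when_not_matched=True,   # upsert: insert source rows with no match
    save_mode="table",
)
```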
def _upsert_save(self, data: DataFrame) -> None:
    if self._load().rdd.isEmpty():
        self._insert_save(data)
    else:
        joined_data = data.alias("new").join(self._load().alias("old"),
                                             self._table_pk, "outer")
        upsert_dataset = joined_data.select([  # type: ignore
            coalesce(f"new.{col_name}", f"old.{col_name}").alias(col_name)
            for col_name in set(data.columns) - set(self._table_pk)  # type: ignore
        ] + self._table_pk)

        temporary_persisted_tbl_name = f"temp_{uuid.uuid4().int}"
        with StagedHiveDataSet(
                upsert_dataset,
                stage_database_name=self._database,
                stage_table_name=temporary_persisted_tbl_name,
        ) as temp_table:
            self._overwrite_save(temp_table.staged_data)
def override_3i_specimen_data(self, dcc_specimen_df: DataFrame) -> DataFrame:
    """
    Whenever a Specimen is present both in the 3i project and in any other data source
    (e.g. EuroPhenome or IMPC), the other data source's specimen data should be used
    instead of the 3i one.
    """
    dcc_specimen_df_a = dcc_specimen_df.alias("a")
    dcc_specimen_df_b = dcc_specimen_df.alias("b")
    dcc_specimen_df = dcc_specimen_df_a.join(
        dcc_specimen_df_b,
        (dcc_specimen_df_a["_specimenID"] == dcc_specimen_df_b["_specimenID"])
        & (dcc_specimen_df_a["_centreID"] == dcc_specimen_df_b["_centreID"])
        & (dcc_specimen_df_a["_dataSource"] != dcc_specimen_df_b["_dataSource"]),
        "left_outer",
    )
    dcc_specimen_df = dcc_specimen_df.where(
        col("b._specimenID").isNull()
        | ((col("b._specimenID").isNotNull()) & (col("a._dataSource") != "3i")))
    return dcc_specimen_df.select("a.*").dropDuplicates()
def process_dim_parking_bay(parkingbay_sdf: DataFrame,
                            dim_parkingbay_sdf: DataFrame,
                            load_id, loaded_on):
    """Transform incoming parkingbay_sdf data and existing dim_parking_bay
    into the latest version of dim_parking_bay records.
    """
    # Get distinct rows from the landing data
    parkingbay_sdf = parkingbay_sdf\
        .select([
            "bay_id", "marker_id", "meter_id", "rd_seg_dsc", "rd_seg_id"])\
        .distinct()

    # Using a left_outer join on the business key (bay_id),
    # identify rows that exist in the Dimension table but NOT in the landing data
    oldrows_parkingbay_sdf = dim_parkingbay_sdf.alias("dim")\
        .join(parkingbay_sdf, "bay_id", "left_outer")\
        .where(parkingbay_sdf["bay_id"].isNull())\
        .select(col("dim.*"))

    # Using a left_outer join on the business key (bay_id),
    # identify rows that exist in BOTH the incoming landing data and the Dimension table,
    # and take the values of the incoming landing data. That is, we update existing table values.
    existingrows_parkingbay_sdf = parkingbay_sdf.alias("pb")\
        .join(dim_parkingbay_sdf.alias("dim"), "bay_id", "left_outer")\
        .where(dim_parkingbay_sdf["bay_id"].isNotNull())\
        .select(
            col("dim.dim_parking_bay_id"),
            col("pb.bay_id"),
            col("pb.marker_id"),
            col("pb.meter_id"),
            col("pb.rd_seg_dsc"),
            col("pb.rd_seg_id")
        )

    # Using a left_outer join on the business key (bay_id),
    # identify rows that exist in the landing data but NOT in the Dimension table
    newrows_parkingbay_sdf = parkingbay_sdf.alias("pb")\
        .join(dim_parkingbay_sdf, "bay_id", "left_outer")\
        .where(dim_parkingbay_sdf["bay_id"].isNull())\
        .select(col("pb.*"))

    # Add load_id, loaded_on and dim_parking_bay_id
    existingrows_parkingbay_sdf = existingrows_parkingbay_sdf.withColumn("load_id", lit(load_id))\
        .withColumn("loaded_on", lit(loaded_on.isoformat()).cast("timestamp"))
    newrows_parkingbay_sdf = newrows_parkingbay_sdf.withColumn("load_id", lit(load_id))\
        .withColumn("loaded_on", lit(loaded_on.isoformat()).cast("timestamp"))\
        .withColumn("dim_parking_bay_id", uuidUdf())

    # Select relevant columns
    relevant_cols = [
        "dim_parking_bay_id", "bay_id", "marker_id", "meter_id", "rd_seg_dsc",
        "rd_seg_id", "load_id", "loaded_on"
    ]
    oldrows_parkingbay_sdf = oldrows_parkingbay_sdf.select(relevant_cols)
    existingrows_parkingbay_sdf = existingrows_parkingbay_sdf.select(relevant_cols)
    newrows_parkingbay_sdf = newrows_parkingbay_sdf.select(relevant_cols)

    allrows_parkingbay_sdf = oldrows_parkingbay_sdf\
        .union(existingrows_parkingbay_sdf)\
        .union(newrows_parkingbay_sdf)

    return allrows_parkingbay_sdf
def get_associated_body_weight(
    self,
    specimen_level_experiment_df: DataFrame,
    mouse_specimen_df: DataFrame,
    impress_df: DataFrame,
) -> DataFrame:
    """
    Takes in a DataFrame with experimental data, one with mouse specimens and one with
    IMPReSS information, applies the algorithm to select the associated BW for any given
    experiment, and calculates the age of the specimen at the selected BW measurement.
    """
    # Explode the nested experiment DF structure so every row represents an observation
    weight_observations: DataFrame = specimen_level_experiment_df.withColumn(
        "simpleParameter", explode_outer("simpleParameter"))

    # Select the parameter relevant pieces from the IMPReSS DF
    parameters = impress_df.select(
        "pipelineKey",
        "procedure.procedureKey",
        "parameter.parameterKey",
        "parameter.analysisWithBodyweight",
    ).distinct()

    # Filter the IMPReSS using the analysisWithBodyweight flag
    weight_parameters = parameters.where(
        col("analysisWithBodyweight").isin(
            ["is_body_weight", "is_fasted_body_weight"]))

    # Join both the observations DF and the BW parameters DF to obtain the observations that are BW
    weight_observations = weight_observations.join(
        weight_parameters,
        ((weight_observations["_pipeline"] == weight_parameters["pipelineKey"])
         & (weight_observations["_procedureID"] == weight_parameters["procedureKey"])
         & (weight_observations["simpleParameter._parameterID"]
            == weight_parameters["parameterKey"])),
    )

    # Create a boolean flag for fasted BW procedures
    weight_observations = weight_observations.withColumn(
        "weightFasted", col("analysisWithBodyweight") == "is_fasted_body_weight")
    weight_observations = weight_observations.select(
        "specimenID",
        "_centreID",
        col("unique_id").alias("sourceExperimentId"),
        col("_dateOfExperiment").alias("weightDate"),
        col("simpleParameter._parameterID").alias("weightParameterID"),
        col("simpleParameter.value").alias("weightValue"),
        "weightFasted",
    )
    weight_observations = weight_observations.where(col("weightValue").isNotNull())

    # Join the body weight observations so we can determine the age of the specimen for any BW measurement
    weight_observations = weight_observations.join(
        mouse_specimen_df,
        (weight_observations["specimenID"] == mouse_specimen_df["_specimenID"])
        & (weight_observations["_centreID"] == mouse_specimen_df["_centreID"]),
    )
    weight_observations = weight_observations.withColumn(
        "weightDaysOld", datediff("weightDate", "_DOB"))

    # Group the weight observations by Specimen
    weight_observations = weight_observations.groupBy("specimenID").agg(
        collect_set(
            struct(
                "sourceExperimentId",
                "weightDate",
                "weightParameterID",
                "weightValue",
                "weightDaysOld",
                "weightFasted",
            )).alias("weight_observations"))

    # Create a temporary "procedureGroup" column to be used in the BW selection
    specimen_level_experiment_df = specimen_level_experiment_df.withColumn(
        "procedureGroup",
        udf(lambda prod_id: prod_id[:prod_id.rfind("_")], StringType())(
            col("_procedureID")),
    )

    # Join all the observations with the BW observations grouped by specimen
    specimen_level_experiment_df = specimen_level_experiment_df.join(
        weight_observations, "specimenID", "left_outer")

    # Schema for the struct that is going to group all the associated BW data
    output_weight_schema = StructType([
        StructField("sourceExperimentId", StringType()),
        StructField("weightDate", DateType()),
        StructField("weightParameterID", StringType()),
        StructField("weightValue", StringType()),
        StructField("weightDaysOld", IntegerType()),
        StructField("error", ArrayType(StringType())),
    ])

    # Alias both the experiment and the specimen DF so they are easier to join and manipulate
    experiment_df_a = specimen_level_experiment_df.alias("exp")
    mice_df_a = mouse_specimen_df.alias("mice")
    specimen_level_experiment_df = experiment_df_a.join(
        mice_df_a,
        (specimen_level_experiment_df["specimenID"]
         == mouse_specimen_df["_specimenID"])
        & (specimen_level_experiment_df["_centreID"]
           == mouse_specimen_df["_centreID"]),
        "left_outer",
    )

    # Add special dates to the experiment x specimen dataframe:
    # for some experiments the date of sacrifice or date of blood collection
    # has to be used as reference for age and BW calculations
    specimen_level_experiment_df = self._add_special_dates(specimen_level_experiment_df)
    get_associated_body_weight_udf = udf(self._get_closest_weight,
                                         output_weight_schema)
    specimen_level_experiment_df = specimen_level_experiment_df.withColumn(
        "weight",
        get_associated_body_weight_udf(
            when(
                col("_dateOfBloodCollection").isNotNull(),
                col("_dateOfBloodCollection"),
            ).when(
                col("_dateOfSacrifice").isNotNull(), col("_dateOfSacrifice")).otherwise(
                    col("_dateOfExperiment")),
            col("procedureGroup"),
            col("weight_observations"),
        ),
    )
    specimen_level_experiment_df = specimen_level_experiment_df.select("exp.*", "weight")
    return specimen_level_experiment_df
def analyze_stream_data(lrModelList=None, batch_summary_df: DataFrame = None):
    os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4 ' \
                                        'pyspark-shell'
    spark = SparkSession.builder.appName("TwitterSparkStream").getOrCreate()
    spark.conf.set("spark.sql.crossJoin.enabled", "true")

    schema = StructType([
        StructField("timestamp", TimestampType(), True),
        StructField("text", StringType(), True),
        StructField("sentiment", DoubleType(), True)
    ])

    # initialize Spark Structured Stream with KafkaConsumer subscribing to topic "tweet" from twitter_stream.py
    # parse the kafka stream to the schema
    stream_df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "tweet") \
        .option("startingOffsets", "latest") \
        .load() \
        .select(from_json(col("value").cast("string"), schema).alias("data")) \
        .select(col("data.timestamp").alias("timestamp"),
                col("data.text").alias("text"),
                col("data.sentiment").alias("sentiment"))

    # set a window of 30 seconds with a watermark of 10 seconds and aggregate min, avg and max sentiment
    # over the windows, as well as the number of tweets with positive, negative and neutral sentiment
    # in each window
    windowed_stream_df = stream_df.withWatermark("timestamp", "10 seconds") \
        .groupBy(window("timestamp", "30 seconds")) \
        .agg(avg("sentiment").alias("avg_sentiment"),
             max("sentiment").alias("max_sentiment"),
             min("sentiment").alias("min_sentiment"),
             count(when(stream_df["sentiment"] > 0.1, True)).alias("pos_tweet_count_stream"),
             count(when((stream_df["sentiment"] <= 0.1) & (stream_df["sentiment"] >= -0.1), True)).alias(
                 "neutral_tweet_count_stream"),
             count(when(stream_df["sentiment"] < -0.1, True)).alias("neg_tweet_count_stream"))
    windowed_stream_df = windowed_stream_df.select(
        windowed_stream_df.window.start.alias("start"), "min_sentiment",
        "avg_sentiment", "max_sentiment", "pos_tweet_count_stream",
        "neutral_tweet_count_stream",
        "neg_tweet_count_stream").sort("start", ascending=False)

    if lrModelList is not None:
        # if there are LinearRegression models as function arguments, predict the number of positive,
        # negative and neutral tweets, based on the models trained on the offline data in spark_offline.py
        featAssembler = VectorAssembler(
            inputCols=["avg_sentiment", "max_sentiment", "min_sentiment"],
            outputCol="features")
        windowed_stream_df = featAssembler.transform(windowed_stream_df)
        windowed_stream_df = lrModelList[0].transform(windowed_stream_df)
        windowed_stream_df = lrModelList[1].transform(windowed_stream_df)
        windowed_stream_df = lrModelList[2].transform(windowed_stream_df)
        windowed_stream_df = windowed_stream_df.select(
            "start", "avg_sentiment", "max_sentiment", "min_sentiment",
            "pos_tweet_count_stream", "pos_tweet_count_pred",
            "neutral_tweet_count_stream", "neutral_tweet_count_pred",
            "neg_tweet_count_stream", "neg_tweet_count_pred")

    if batch_summary_df is not None:
        # if there is a dataframe argument with the average values of the specific metrics from offline data,
        # calculate the difference between the number of positive, negative and neutral tweets in the online
        # data and the average of the corresponding metric from the offline data for comparison
        windowed_stream_df = windowed_stream_df.withColumn("tmp1", lit("mean"))
        windowed_stream_df = windowed_stream_df.alias("stream").join(
            batch_summary_df.alias("batch"),
            col("stream.tmp1") == col("batch.summary"), "inner")
        windowed_stream_df = windowed_stream_df\
            .withColumn("pos_tweet_count_diff",
                        col("pos_tweet_count_stream") - col("pos_tweet_count_batch"))\
            .withColumn("neutral_tweet_count_diff",
                        col("neutral_tweet_count_stream") - col("neutral_tweet_count_batch")) \
            .withColumn("neg_tweet_count_diff",
                        col("neg_tweet_count_stream") - col("neg_tweet_count_batch"))
        windowed_stream_df = windowed_stream_df.select(
            "start", "min_sentiment", "avg_sentiment", "max_sentiment",
            "pos_tweet_count_batch", "pos_tweet_count_stream",
            "pos_tweet_count_diff", "pos_tweet_count_pred",
            "neutral_tweet_count_batch", "neutral_tweet_count_stream",
            "neutral_tweet_count_diff", "neutral_tweet_count_pred",
            "neg_tweet_count_batch", "neg_tweet_count_stream",
            "neg_tweet_count_diff", "neg_tweet_count_pred")

    # write the spark stream to the console in output mode "complete"
    query = windowed_stream_df.writeStream\
        .format("console")\
        .outputMode("complete")\
        .option("truncate", False)\
        .start()
    query.awaitTermination()
def map_experiments_to_observations(
    experiment_df: DataFrame,
    line_df: DataFrame,
    mouse_df: DataFrame,
    embryo_df,
    allele_df: DataFrame,
    colony_df: DataFrame,
    pipeline_df: DataFrame,
    strain_df: DataFrame,
    ontology_df: DataFrame,
):
    experiment_df = experiment_df.withColumnRenamed("_sourceFile", "experiment_source_file")
    experiment_df = experiment_df.withColumnRenamed("unique_id", "experiment_id")
    experiment_df = experiment_df.alias("experiment")

    colony_df = colony_df.alias("colony")

    embryo_df = embryo_df.withColumn("_DOB", lit(None).cast(StringType()))
    embryo_df = embryo_df.withColumn("_VALUE", lit(None).cast(StringType()))
    mouse_df = mouse_df.withColumn("_stage", lit(None).cast(StringType()))
    mouse_df = mouse_df.withColumn("_stageUnit", lit(None).cast(StringType()))
    specimen_df = mouse_df.union(embryo_df.select(mouse_df.columns))
    specimen_df = specimen_df.withColumnRenamed("_sourceFile", "specimen_source_file")
    specimen_df = specimen_df.withColumnRenamed("unique_id", "specimen_id")
    specimen_df = specimen_df.alias("specimen")

    allele_df = allele_df.alias("allele")
    strain_df = strain_df.alias("strain")

    observation_df: DataFrame = experiment_df.join(
        specimen_df,
        (experiment_df["experiment._centreID"] == specimen_df["specimen._centreID"])
        & (experiment_df["experiment.specimenID"] == specimen_df["specimen._specimenID"]),
        "left_outer",
    )
    observation_df = observation_df.join(
        colony_df,
        (observation_df["specimen._colonyID"] == colony_df["colony.colony_name"]),
        "left_outer",
    )
    observation_df = observation_df.join(
        allele_df,
        observation_df["colony.allele_symbol"] == allele_df["allele.alleleSymbol"],
        "left_outer",
    )
    experimental_observation_df = observation_df.where(
        (lower(col("specimen._colonyID")) != "baseline")
        & (col("specimen._isBaseline") != True)).join(
            strain_df,
            col("colony.colony_background_strain") == col("strain.strainName"),
            "left_outer",
        )
    baseline_observation_df = observation_df.where(
        (lower(col("specimen._colonyID")) == "baseline")
        | (col("specimen._isBaseline") == True)).join(
            strain_df,
            when(
                concat(lit("MGI:"), col("specimen._strainID")) == col("strain.mgiStrainID"),
                concat(lit("MGI:"), col("specimen._strainID")) == col("strain.mgiStrainID"),
            ).otherwise(col("specimen._strainID") == col("strain.strainName")),
            "left_outer",
        )
    # TODO: fall back to iMits when the strain is missing and do the join again
    observation_df = baseline_observation_df.union(experimental_observation_df)

    simple_observation_df = process_parameter_values(observation_df, pipeline_df,
                                                     "simpleParameter")
    simple_observation_df = add_observation_type(simple_observation_df)
    simple_observation_df = resolve_simple_value(simple_observation_df, pipeline_df)
    simple_observation_df = unify_schema(simple_observation_df).select(
        Constants.OBSERVATION_COLUMNS)

    line_df = (line_df.withColumnRenamed(
        "_sourceFile", "experiment_source_file").withColumnRenamed(
            "unique_id", "experiment_id").withColumn("specimen_source_file",
                                                     lit(None)).alias("experiment"))
    line_observation_df = line_df.join(
        colony_df, line_df["_colonyID"] == colony_df["colony.colony_name"])
    line_observation_df = line_observation_df.join(
        strain_df,
        col("colony.colony_background_strain") == col("strain.strainName"))
    line_observation_df = line_observation_df.join(
        allele_df,
        observation_df["colony.allele_symbol"] == allele_df["allele.alleleSymbol"],
        "left_outer",
    )
    line_simple_observation_df = process_parameter_values(line_observation_df,
                                                          pipeline_df,
                                                          "simpleParameter",
                                                          exp_type="line")
    line_simple_observation_df = add_observation_type(line_simple_observation_df)
    line_simple_observation_df = resolve_simple_value(line_simple_observation_df,
                                                      pipeline_df)
    line_simple_observation_df = line_simple_observation_df.withColumn(
        "specimen_id", lit(None))
    line_simple_observation_df = unify_schema(line_simple_observation_df).select(
        Constants.OBSERVATION_COLUMNS)
    simple_observation_df = simple_observation_df.union(line_simple_observation_df)

    body_weight_curve_observation_df = get_body_weight_curve_observations(
        simple_observation_df.where(col("observation_type") == "unidimensional"))

    simple_media_observation_df = process_parameter_values(observation_df,
                                                           pipeline_df,
                                                           "mediaParameter")
    if simple_media_observation_df is not None:
        simple_media_observation_df = resolve_simple_media_value(
            simple_media_observation_df)
        simple_media_observation_df = unify_schema(
            simple_media_observation_df).select(Constants.OBSERVATION_COLUMNS)

    ontological_observation_df = process_parameter_values(observation_df,
                                                          pipeline_df,
                                                          "ontologyParameter")
    ontological_observation_df = resolve_ontology_value(ontological_observation_df,
                                                        ontology_df)
    ontological_observation_df = unify_schema(ontological_observation_df).select(
        Constants.OBSERVATION_COLUMNS)

    time_series_observation_df = process_parameter_values(observation_df,
                                                          pipeline_df,
                                                          "seriesParameter")
    time_series_observation_df = resolve_time_series_value(time_series_observation_df)
    time_series_observation_df = unify_schema(time_series_observation_df).select(
        Constants.OBSERVATION_COLUMNS)

    image_record_observation_df = process_parameter_values(observation_df,
                                                           pipeline_df,
                                                           "seriesMediaParameter")
    image_record_observation_df = resolve_image_record_value(
        image_record_observation_df)
    image_record_observation_df = resolve_image_record_parameter_association(
        image_record_observation_df, simple_observation_df)
    image_record_observation_df = unify_schema(image_record_observation_df).select(
        Constants.OBSERVATION_COLUMNS)

    observation_df = (
        simple_observation_df.union(ontological_observation_df)
        .union(image_record_observation_df)
        .union(time_series_observation_df)
        .union(body_weight_curve_observation_df))
    if simple_media_observation_df is not None:
        observation_df = observation_df.union(simple_media_observation_df)

    observation_df = observation_df.where(col("parameter_status").isNull())
    observation_df = format_columns(observation_df).drop_duplicates()
    observation_df = observation_df.withColumn(
        "experiment_source_file",
        regexp_extract(col("experiment_source_file"), r"(.*\/)(.*\/.*\.xml)", idx=2),
    )
    observation_df = observation_df.withColumn(
        "specimen_source_file",
        regexp_extract(col("specimen_source_file"), r"(.*\/)(.*\/.*\.xml)", idx=2),
    )
    observation_df = observation_df.withColumn("life_stage_name", lit(None))
    observation_df = observation_df.withColumn("life_stage_acc", lit(None))
    for life_stage in Constants.PROCEDURE_LIFE_STAGE_MAPPER:
        life_stage_name = life_stage["lifeStage"]
        observation_df = observation_df.withColumn(
            "life_stage_name",
            when(
                col("life_stage_name").isNull(),
                when(
                    (col("procedure_stable_id").rlike("|".join(
                        [f"({ proc })" for proc in life_stage["procedures"]]))
                     | (col("developmental_stage_name") == life_stage_name)),
                    lit(life_stage_name),
                ).otherwise(lit(None)),
            ).otherwise(col("life_stage_name")),
        )
        observation_df = observation_df.withColumn(
            "life_stage_acc",
            when(
                col("life_stage_acc").isNull(),
                when(
                    (col("procedure_stable_id").rlike("|".join(
                        [f"({ proc })" for proc in life_stage["procedures"]]))
                     | (col("developmental_stage_name") == life_stage_name)),
                    lit(life_stage["lifeStageAcc"]),
                ).otherwise(lit(None)),
            ).otherwise(col("life_stage_acc")),
        )
    observation_df = observation_df.withColumn(
        "life_stage_name",
        when((col("life_stage_name").isNull()),
             lit("Early adult")).otherwise(col("life_stage_name")),
    )
    observation_df = observation_df.withColumn(
        "life_stage_acc",
        when((col("life_stage_acc").isNull()),
             lit("IMPCLS:0005")).otherwise(col("life_stage_acc")),
    )
    return observation_df
def alias(self, *args, **kwargs): return DataFrame.alias(self, *args, **kwargs)
def transform_with_data_frame_single_select(self, df: DataFrame,
                                            source_df: DataFrame,
                                            keys: List[str]) -> DataFrame:
    # get all the column specs
    column_specs: List[Column] = [
        child_mapper.get_column_specs(source_df=source_df)[column_name]
        for column_name, child_mapper in self.mappers.items()
    ]
    try:
        if not self.drop_key_columns:
            column_specs = [col(f"b.{c}") for c in keys] + column_specs
        if self.enable_logging:
            print(f"-------- automapper ({self.view}) column specs ------")
            print(self.to_debug_string(source_df=source_df))
            print(f"-------- end automapper ({self.view}) column specs ------")
            print(f"-------- automapper ({self.source_view}) source_df schema ------")
            source_df.printSchema()
            print(f"-------- end automapper ({self.source_view}) source_df schema ------")
        if self.check_schema_for_all_columns:
            for column_name, mapper in self.mappers.items():
                check_schema_result: Optional[CheckSchemaResult] = mapper.check_schema(
                    parent_column=None, source_df=source_df)
                if check_schema_result and len(check_schema_result.result.errors) > 0:
                    print(f"==== ERROR: Schema Mismatch [{column_name}] ==="
                          f"{str(check_schema_result)}")
                else:
                    print(f"==== Schema Matches: [{column_name}] ====")

        # run all the selects
        df = source_df.alias("b").select(*column_specs)

        # write out final checkpoint for this automapper
        if self.checkpoint_path:
            checkpoint_path = Path(self.checkpoint_path).joinpath(
                self.view or "df").joinpath("final")
            df.write.parquet(str(checkpoint_path))
            df = df.sql_ctx.read.parquet(str(checkpoint_path))
    except AnalysisException:
        # iterate through each column to find the problem child
        for column_name, mapper in self.mappers.items():
            try:
                print(f"========= Processing {column_name} =========== ")
                column_spec = mapper.get_column_specs(source_df=source_df)[column_name]
                source_df.alias("b").select(column_spec).limit(1).count()
                print(f"========= Done Processing {column_name} =========== ")
            except AnalysisException as e2:
                print(f"========= checking Schema {column_name} =========== ")
                check_schema_result = mapper.check_schema(parent_column=None,
                                                          source_df=source_df)
                msg: str = ""
                if e2.desc.startswith("cannot resolve 'array"):
                    msg = "Looks like the elements of the array have different structures. " \
                          "All items in an array should have the exact same structure. " \
                          "You can pass in include_nulls to AutoMapperDataTypeComplexBase to force it to create " \
                          "null values for each element in the structure. "
                msg += self.get_message_for_exception(
                    column_name + ": " + str(check_schema_result), df, e2, source_df)
                raise AutoMapperException(msg) from e2
    except Exception as e:
        print("==== OOPS ===========")
        msg = self.get_message_for_exception("", df, e, source_df)
        raise Exception(msg) from e

    print(f"========= Finished AutoMapper {self.view} =========== ")
    return df
def get_history_product(self, old_dataframe: DataFrame, new_dataframe: DataFrame):
    cols = old_dataframe.columns

    # Build the join expression from the primary keys if they are defined,
    # otherwise fall back to comparing all columns
    try:
        pk = self.primary_keys
        expressions = []
        for p in pk:
            expressions.append(f'old.{p} <=> new.{p}')
        expression = ' and '.join(expressions)
    except TypeError:
        expressions = []
        for c in cols:
            expressions.append(f'old.{c} <=> new.{c}')
        expression = ' and '.join(expressions)

    not_changed_keys = []
    for c in cols:
        not_changed_keys.append(f'old.{c} <=> new.{c}')
    not_changed_expression = ' and '.join(not_changed_keys)

    old_dataframe = old_dataframe.alias('old')
    new_dataframe = new_dataframe.alias('new')

    not_changed_rows = old_dataframe.join(new_dataframe,
                                          expr(not_changed_expression), how='semi')
    changed_rows_new = new_dataframe.join(old_dataframe, expr(expression), how='semi')\
        .exceptAll(not_changed_rows)
    changed_rows_old = old_dataframe.join(new_dataframe, expr(expression), how='semi') \
        .exceptAll(not_changed_rows)
    old_rows = old_dataframe\
        .exceptAll(not_changed_rows) \
        .exceptAll(changed_rows_old)
    new_rows = new_dataframe\
        .exceptAll(not_changed_rows) \
        .exceptAll(changed_rows_new)

    not_changed_rows = not_changed_rows.withColumn('meta', lit('not_changed'))
    changed_rows = changed_rows_new.withColumn('meta', lit('changed'))
    old_rows = old_rows.withColumn('meta', lit('deleted'))
    new_rows = new_rows.withColumn('meta', lit('inserted'))

    result_dataframe = not_changed_rows\
        .union(changed_rows)\
        .union(old_rows)\
        .union(new_rows)\
        .sort('id', 'name')
    return result_dataframe
def _transform_with_data_frame_single_select(
    self, df: DataFrame, source_df: DataFrame, keys: List[str]
) -> DataFrame:
    """
    This internal function transforms the data frame using the mappings in a single select command

    :param df: destination data frame
    :param source_df: source data frame
    :param keys: key columns
    """
    # get all the column specs
    column_specs: List[Column] = [
        child_mapper.get_column_specs(source_df=source_df)[column_name]
        for column_name, child_mapper in self.mappers.items()
    ]
    try:
        if self.copy_all_unmapped_properties:
            # find all source properties not mapped
            source_properties: List[str] = source_df.columns
            mapped_properties: List[str] = list(self.mappers.keys())
            unmapped_properties: List[str] = [
                p for p in source_properties if p not in mapped_properties
            ]
            copy_all_unmapped_properties_exclude: List[str] = (
                self.copy_all_unmapped_properties_exclude or []
            )
            column_schema: Dict[str, StructField] = (
                {f.name: f for f in source_df.schema} if self.use_schema else {}
            )
            # for each unmapped property add a simple A.column()
            column_specs.extend(
                [
                    AutoMapperDataTypeColumn(column_name)
                    .get_column_spec(
                        source_df=source_df,
                        current_column=None,
                        parent_columns=None,
                    )
                    .cast(column_schema[column_name].dataType)
                    if column_name in column_schema
                    else AutoMapperDataTypeColumn(column_name).get_column_spec(
                        source_df=source_df,
                        current_column=None,
                        parent_columns=None,
                    )
                    for column_name in unmapped_properties
                    if column_name not in copy_all_unmapped_properties_exclude
                ]
            )

        if not self.drop_key_columns:
            column_specs = [col(f"b.{c}") for c in keys] + column_specs

        print(f"COLUMN SPECS --- {column_specs}")
        self.logger.debug(f"-------- automapper ({self.view}) column specs ------")
        self.logger.debug(self.to_debug_string(source_df=source_df))
        self.logger.debug(
            f"-------- end automapper ({self.view}) column specs ------"
        )
        self.logger.debug(
            f"-------- automapper ({self.source_view}) source_df schema ------"
        )
        # noinspection PyProtectedMember
        self.logger.debug(source_df._jdf.schema().treeString())  # type: ignore
        self.logger.debug(
            f"-------- end automapper ({self.source_view}) source_df schema ------"
        )

        if self.check_schema_for_all_columns:
            for column_name, mapper in self.mappers.items():
                check_schema_result: Optional[
                    CheckSchemaResult
                ] = mapper.check_schema(parent_column=None, source_df=source_df)
                if (
                    check_schema_result
                    and len(check_schema_result.result.errors) > 0
                ):
                    self.logger.info(
                        f"==== Schema Mismatch [{column_name}] ==="
                        f"{str(check_schema_result)}"
                    )
                else:
                    self.logger.debug(f"==== Schema Matches: [{column_name}] ====")

        # run all the selects
        df = source_df.alias("b").select(*column_specs)

        # write out final checkpoint for this automapper
        if self.checkpoint_path:
            checkpoint_path = (
                Path(self.checkpoint_path)
                .joinpath(self.view or "df")
                .joinpath("final")
            )
            df.write.parquet(str(checkpoint_path))
            df = df.sql_ctx.read.parquet(str(checkpoint_path))
    except (AnalysisException, ValueError):
        self.logger.warning(
            f"-------- automapper ({self.view}) column specs ------"
        )
        self.logger.warning(self.to_debug_string(source_df=source_df))
        self.logger.warning(
            f"-------- end automapper ({self.view}) column specs ------"
        )
        self.logger.debug(
            f"-------- automapper ({self.source_view}) source_df schema ------"
        )
        # noinspection PyProtectedMember
        self.logger.debug(source_df._jdf.schema().treeString())  # type: ignore
        self.logger.debug(
            f"-------- end automapper ({self.source_view}) source_df schema ------"
        )
        # iterate through each column to find the problem child
        for column_name, mapper in self.mappers.items():
            try:
                self.logger.debug(
                    f"========= Processing {column_name} =========== "
                )
                column_spec = mapper.get_column_specs(source_df=source_df)[
                    column_name
                ]
                source_df.alias("b").select(column_spec).limit(1).count()
                self.logger.debug(
                    f"========= Done Processing {column_name} =========== "
                )
            except (AnalysisException, ValueError) as e2:
                self.logger.error(
                    f"========= Processing {column_name} FAILED =========== "
                )
                column_spec = mapper.get_column_specs(source_df=source_df)[
                    column_name
                ]
                self.logger.warning(
                    ColumnSpecWrapper(column_spec).to_debug_string()
                )
                self.logger.error(
                    f"========= checking schema for failed column {column_name} =========== "
                )
                check_schema_result = mapper.check_schema(
                    parent_column=None, source_df=source_df
                )
                msg: str = ""
                if isinstance(e2, AnalysisException) and e2.desc.startswith(
                    "cannot resolve 'array"
                ):
                    msg = (
                        "Looks like the elements of the array have different structures. "
                        "All items in an array should have the exact same structure. "
                        "You can pass in include_nulls to AutoMapperDataTypeComplexBase to force it to create "
                        "null values for each element in the structure. \n"
                    )
                # Collecting sample column values from the first rows was removed here
                # because it can trigger a "GC overhead limit reached" error that
                # obfuscates the actual error.
                column_values: Optional[List[Any]] = None
                msg += self._get_message_for_exception(
                    column_name=column_name,
                    check_schema_result=check_schema_result,
                    df=df,
                    e=e2,
                    source_df=source_df,
                    column_values=column_values,
                )
                raise AutoMapperAnalysisException(
                    automapper_name=self.view,
                    msg=msg,
                    column_name=column_name,
                    check_schema_result=check_schema_result,
                    column_values=column_values,
                ) from e2
    except Exception as e:
        self.logger.error("==== OOPS ===========")
        msg = self._get_message_for_exception(
            column_name="",
            check_schema_result=None,
            df=df,
            e=e,
            source_df=source_df,
            column_values=None,
        )
        raise Exception(msg) from e

    self.logger.debug(f"========= Finished AutoMapper {self.view} =========== ")
    return df
def transform_data(clickstream_df: DataFrame, purchases_df: DataFrame,
                   logger: Log4jWrapper) -> DataFrame:
    """
    Builds purchases attribution dataset from mobile app clickstream and user purchases data.

    :param clickstream_df: mobile app clickstream dataset as DataFrame.
    :param purchases_df: user purchases dataset as DataFrame.
    :return: transformed DataFrame.
    """
    user_window: Window = Window.partitionBy("userId").orderBy("eventTime")

    logger.warning("build leading events dataset ...")
    leading_events_df = (clickstream_df.filter(
        F.col("eventType").isin({"app_open", "app_close"})).withColumn(
            "nextEventId",
            F.lead(F.col("eventId"), 1).over(user_window)).select(
                F.col("userId"),
                F.col("eventTime").alias("startSessionEventTime"),
                F.col("eventType").alias("startSessionEventType"),
                F.col("attributes"),
                F.col("nextEventId")).filter(
                    F.col("startSessionEventType") == "app_open").withColumn(
                        "campaignId",
                        F.get_json_object(F.col("attributes"),
                                          "$.campaign_id")).withColumn(
                            "channelId",
                            F.get_json_object(F.col("attributes"),
                                              "$.channel_id")).drop("attributes"))
    leading_events_df.show()

    logger.warning("calculate user sessions ...")
    sessions_df = (leading_events_df.alias("leading_events").join(
        clickstream_df.alias("all_events"),
        on=F.col("leading_events.nextEventId") == F.col("all_events.eventId"),
        how="left").select(
            F.col("leading_events.userId"),
            F.col("leading_events.startSessionEventTime"),
            F.col("leading_events.campaignId"),
            F.col("leading_events.channelId"),
            F.col("all_events.eventTime").alias("endSessionEventTime"),
        ).withColumn("sessionId", F.monotonically_increasing_id()))
    sessions_df.show()

    logger.warning("append session to each event ...")
    sessioned_purchases_df = (clickstream_df.alias("c").filter(
        F.col("c.eventType") == "purchase").join(
            sessions_df.alias("s"),
            on=[
                F.col("c.userId") == F.col("s.userId"),
                F.col("c.eventTime") >= F.col("s.startSessionEventTime"),
                (F.col("c.eventTime") <= F.col("s.endSessionEventTime"))
                | F.col("s.endSessionEventTime").isNull(),
            ]).select(
                F.col("s.userId"),
                F.col("s.sessionId"),
                F.col("s.campaignId"),
                F.col("s.channelId"),
                F.get_json_object(F.col("c.attributes"),
                                  "$.purchase_id").alias("purchaseId"),
            ).orderBy(F.col("userId"), F.col("eventTime")))
    sessioned_purchases_df.show()

    logger.warning("build purchases attribution ...")
    projection_df = (sessioned_purchases_df.alias("s").join(
        purchases_df.alias("p"),
        on=F.col("p.purchaseId") == F.col("s.purchaseId")).select(
            F.col("p.purchaseId"), F.col("p.purchaseTime"),
            F.col("p.billingCost"), F.col("p.isConfirmed"),
            F.col("s.sessionId"), F.col("s.campaignId"), F.col("s.channelId")))
    projection_df.show()

    return projection_df