def __get_intersections(self, dataframe: SparkDataFrame,
                        ground_truth: SparkDataFrame) -> Tuple[int, int]:
    tmp_label = 'tmp_label'
    intersected = dataframe.select(self.intersection_column, self.match_column).join(
        ground_truth.select(self.intersection_column, self.match_column),
        self.intersection_column)
    # Label a row 1 when both sides agree on the match column.
    intersected = intersected.withColumn(
        tmp_label,
        F.when(dataframe[self.match_column] == ground_truth[self.match_column],
               1).otherwise(0))
    if self.friendly_precision:
        # Propagate the best label across each intersection group, so a single
        # matching row marks the whole group as a true positive.
        intersected = intersected.withColumn(
            tmp_label,
            F.max(tmp_label).over(Window.partitionBy(self.intersection_column)))
    intersected.persist()
    all_positives_count = intersected.count()
    true_positive_count = intersected.filter(F.col(tmp_label) == 1).count()
    intersected.unpersist()
    return true_positive_count, all_positives_count
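# Hedged standalone sketch, separate from the class above: it only illustrates
# the "friendly precision" step, where F.max(...).over(Window.partitionBy(...))
# marks every row of a group as positive once any row in it matched. The local
# SparkSession, the toy rows and the 'entity_id' column name are assumptions
# made for this example.
def _demo_friendly_precision_window():
    from pyspark.sql import SparkSession, Window
    from pyspark.sql import functions as F

    spark = SparkSession.builder.master('local[1]').getOrCreate()
    demo = spark.createDataFrame(
        [('a', 0), ('a', 1), ('b', 0)], ['entity_id', 'tmp_label'])
    # Entity 'a' ends up with tmp_label == 1 on both rows; 'b' keeps 0.
    demo.withColumn(
        'tmp_label',
        F.max('tmp_label').over(Window.partitionBy('entity_id'))).show()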
def join_3_tables_by_streamer_and_timestamp(
        table_1: dataframe,
        table_2: dataframe,
        table_3: dataframe,
        table_1_alias: str = 'youtube_',
        table_2_alias: str = 'twitch_',
        table_3_alias: str = 'twitter_') -> dataframe:
    table_1 = table_1.withColumnRenamed('follower_count', table_1_alias + 'count')
    table_2 = table_2\
        .withColumnRenamed('streamer', table_2_alias + 'streamer')\
        .withColumnRenamed('timestamp', table_2_alias + 'timestamp')\
        .withColumnRenamed('follower_count', table_2_alias + 'count')
    table_3 = table_3\
        .withColumnRenamed('streamer', table_3_alias + 'streamer')\
        .withColumnRenamed('timestamp', table_3_alias + 'timestamp')\
        .withColumnRenamed('follower_count', table_3_alias + 'count')
    joined_tables = table_1\
        .join(table_2,
              (table_1['streamer'] == table_2[table_2_alias + 'streamer']) &
              (table_1['timestamp'] == table_2[table_2_alias + 'timestamp']))\
        .join(table_3,
              (table_1['streamer'] == table_3[table_3_alias + 'streamer']) &
              (table_1['timestamp'] == table_3[table_3_alias + 'timestamp']))
    return joined_tables.select('streamer', 'timestamp',
                                table_1_alias + 'count',
                                table_2_alias + 'count',
                                table_3_alias + 'count')
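# Hedged usage sketch for join_3_tables_by_streamer_and_timestamp. The local
# SparkSession and the toy rows are assumptions for illustration; each input is
# expected to carry 'streamer', 'timestamp' and 'follower_count' columns.
def _demo_join_3_tables():
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master('local[1]').getOrCreate()
    columns = ['streamer', 'timestamp', 'follower_count']
    youtube = spark.createDataFrame([('streamer_a', '2020-01-01 00:00:00', 100)], columns)
    twitch = spark.createDataFrame([('streamer_a', '2020-01-01 00:00:00', 200)], columns)
    twitter = spark.createDataFrame([('streamer_a', '2020-01-01 00:00:00', 300)], columns)
    # One row per streamer/timestamp with youtube_count, twitch_count and twitter_count.
    join_3_tables_by_streamer_and_timestamp(youtube, twitch, twitter).show()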
def spark_unified_aggregator(udf_agg: udf,
                             table_df: dataframe,
                             partition_key: str = 'streamer',
                             clustering_key: str = 'timestamp',
                             clustering_key_alias: str = 'timestamp') -> dataframe:
    return table_df.groupBy(partition_key,
                            udf_agg(clustering_key).alias(clustering_key_alias)) \
        .agg(sql_mean('youtube_count').alias('youtube_count'),
             sql_mean('twitter_count').alias('twitter_count'),
             sql_mean('twitch_count').alias('twitch_count'),
             sql_mean('total_count').alias('total_count'))
def cleanColumnNamesForParquet(df: Dataframe) -> Dataframe:
    # Lower-case the names, replace spaces with underscores and strip the
    # characters that Spark rejects when writing Parquet column names.
    newColumns = []
    nchars = ',;{}()='
    for c in df.columns:
        c = c.lower()
        c = c.replace(' ', '_')
        for nc in nchars:
            c = c.replace(nc, '')
        newColumns.append(c)
    df = df.toDF(*newColumns)
    return df
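# Hedged usage sketch for cleanColumnNamesForParquet; the local SparkSession
# and the messy column names are assumptions for illustration.
def _demo_clean_column_names():
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master('local[1]').getOrCreate()
    df = spark.createDataFrame([(1.0, 2.0)], ['Max Temp (C)', 'Dew Point;'])
    # Prints 'max_temp_c' and 'dew_point', names that Parquet accepts.
    cleanColumnNamesForParquet(df).printSchema()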
def noaa_transform(df: Dataframe, columns: Dict[str, List]) -> Dataframe:
    # Expand the NOAA FRSHTT indicator string (fog, rain, snow, hail, thunder,
    # tornado) into one boolean column per position.
    def __f(r, idx):
        v = list(str(r))
        return v[idx] != '0'

    transformFRSHTT = spark.udf.register("transformFRSHTT", __f, BooleanType())
    for i, c in enumerate(columns['FRSHTT']):
        df = df.withColumn(c, transformFRSHTT(df['FRSHTT'], spark_lit(i)))
    return df
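# Hedged alternative sketch, not the function above: the same FRSHTT expansion
# can be done without a Python UDF by slicing the indicator string with
# pyspark.sql.functions.substring (1-based positions). The local SparkSession,
# the toy row and the flag names are assumptions for illustration.
def _demo_frshtt_without_udf():
    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F

    spark = SparkSession.builder.master('local[1]').getOrCreate()
    df = spark.createDataFrame([('010000',)], ['FRSHTT'])
    for i, name in enumerate(['fog', 'rain', 'snow', 'hail', 'thunder', 'tornado']):
        df = df.withColumn(name, F.substring('FRSHTT', i + 1, 1) != '0')
    df.show()  # only 'rain' is True for this row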
def __fields_from_dataframe(self, dataframe_object: dataframe,
                            is_string: bool) -> list:
    text_fields = []
    first_row = dataframe_object.first()
    if is_string:
        for column in dataframe_object.schema.names:
            if type(first_row[column]) == str:
                text_fields.append(column)
    else:
        for column in dataframe_object.schema.names:
            if type(first_row[column]) != str:
                text_fields.append(column)
    return text_fields
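# Hedged alternative sketch, not the method above: the same string / non-string
# split can be read from the schema instead of the first row, which also works
# when the DataFrame is empty. The function name is an assumption for
# illustration.
def _demo_fields_from_schema(dataframe_object, is_string: bool) -> list:
    from pyspark.sql.types import StringType

    return [field.name for field in dataframe_object.schema.fields
            if isinstance(field.dataType, StringType) == is_string]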
def __save_classifier_result(self, predicted_df: dataframe,
                             filename_metadata: dict) -> None:
    self.__database.update_one(
        filename_metadata["datasetName"], filename_metadata,
        {self.DOCUMENT_ID_NAME: self.METADATA_DOCUMENT_ID})
    document_id = 1
    for row in predicted_df.collect():
        row_dict = row.asDict()
        row_dict["_id"] = document_id
        # The probability column is a Spark ML vector; store it as a plain list.
        row_dict["probability"] = row_dict["probability"].toArray().tolist()
        document_id += 1
        # Drop the intermediate Spark ML columns before inserting the row.
        del row_dict["features"]
        del row_dict["rawPrediction"]
        self.__database.insert_one_in_file(
            filename_metadata["datasetName"], row_dict)
    self.__metadata_creator.update_finished_flag(
        filename_metadata["datasetName"], True)
def spark_live_aggregator(udf_agg: udf,
                          table_df: dataframe,
                          initial_value: str,
                          new_value_alias: str,
                          partition_key: str = 'streamer',
                          clustering_key: str = 'timestamp',
                          clustering_key_alias: str = 'timestamp') -> dataframe:
    return table_df.groupBy(partition_key,
                            udf_agg(clustering_key).alias(clustering_key_alias)) \
        .agg(sql_mean(initial_value).alias(new_value_alias))
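# Hedged usage sketch for spark_live_aggregator. It assumes the module-level
# sql_mean alias the aggregator relies on (pyspark.sql.functions.mean), that
# timestamps are stored as strings, and uses a truncate-to-hour UDF plus toy
# rows purely for illustration.
def _demo_live_aggregator():
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import udf as spark_udf
    from pyspark.sql.types import StringType

    spark = SparkSession.builder.master('local[1]').getOrCreate()
    hour_udf = spark_udf(lambda ts: ts[:13], StringType())
    df = spark.createDataFrame(
        [('streamer_a', '2020-01-01 10:05:00', 100),
         ('streamer_a', '2020-01-01 10:35:00', 200)],
        ['streamer', 'timestamp', 'follower_count'])
    # Mean follower_count per streamer per hour bucket.
    spark_live_aggregator(hour_udf, df, 'follower_count', 'avg_count').show()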
def write_to_hdfs(self, df: dataframe, filename):
    df.coalesce(1).write.mode('overwrite').csv(hdfs_prefix + filename, header='true')
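# Hedged standalone sketch of the same pattern outside the class: coalesce(1)
# funnels all data through a single task so exactly one CSV part file is
# produced, which is convenient for small exports but slow for large ones.
# The function name and the `path` argument are assumptions for illustration.
def _demo_write_single_csv(df, path):
    df.coalesce(1).write.mode('overwrite').csv(path, header=True)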