Example #1
    def __get_intersections(self, dataframe: SparkDataFrame,
                            ground_truth: SparkDataFrame) -> Tuple[int, int]:
        tmp_label = 'tmp_label'
        # Join predictions and ground truth on the shared intersection key.
        intersected = dataframe.select(self.intersection_column,
                                       self.match_column).join(
                                           ground_truth.select(
                                               self.intersection_column,
                                               self.match_column),
                                           self.intersection_column)
        # Label a row 1 when both sides agree on the match column, else 0.
        intersected = intersected.withColumn(
            tmp_label,
            F.when(
                dataframe[self.match_column] == ground_truth[
                    self.match_column], 1).otherwise(0))
        if self.friendly_precision:
            # "Friendly" mode: treat a key as a true positive if any of its
            # joined rows matched, by taking the max label per partition.
            intersected = intersected.withColumn(
                tmp_label,
                F.max(tmp_label).over(
                    Window.partitionBy(self.intersection_column)))

        # Cache once; both counts below scan the same joined frame.
        intersected.persist()
        all_positives_count = intersected.count()
        true_positive_count = intersected.filter(F.col(tmp_label) == 1).count()
        intersected.unpersist()
        return true_positive_count, all_positives_count
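
A minimal usage sketch of how the two counts are typically combined downstream; the precision formula and the caller's variable names are assumptions, not part of the original method.

    # Hypothetical caller inside the same class:
    tp, total = self.__get_intersections(predictions_df, ground_truth_df)
    precision = tp / total if total > 0 else 0.0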
Example #2
def join_3_tables_by_streamer_and_timestamp(
        table_1: dataframe, table_2: dataframe, table_3: dataframe,
        table_1_alias: str = 'youtube_', table_2_alias: str = 'twitch_', table_3_alias: str = 'twitter_'
) -> dataframe:
    # Prefix each source's follower_count (and the join keys of tables 2
    # and 3) so the three platforms stay distinguishable after the join.
    table_1 = table_1.withColumnRenamed('follower_count', table_1_alias + 'count')

    table_2 = table_2\
        .withColumnRenamed('streamer', table_2_alias + 'streamer')\
        .withColumnRenamed('timestamp', table_2_alias + 'timestamp')\
        .withColumnRenamed('follower_count', table_2_alias + 'count')

    table_3 = table_3\
        .withColumnRenamed('streamer', table_3_alias + 'streamer')\
        .withColumnRenamed('timestamp', table_3_alias + 'timestamp')\
        .withColumnRenamed('follower_count', table_3_alias + 'count')

    # Inner-join on the (streamer, timestamp) pair kept by table_1.
    joined_tables = table_1\
        .join(table_2,
              (table_1['streamer'] == table_2[table_2_alias + 'streamer'])
              & (table_1['timestamp'] == table_2[table_2_alias + 'timestamp']))\
        .join(table_3,
              (table_1['streamer'] == table_3[table_3_alias + 'streamer'])
              & (table_1['timestamp'] == table_3[table_3_alias + 'timestamp']))

    return joined_tables.select('streamer', 'timestamp', table_1_alias + 'count',
                                table_2_alias + 'count', table_3_alias + 'count')
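
A hedged usage sketch with toy data; the (streamer, timestamp, follower_count) schema is implied by the column names the function manipulates, everything else here is illustrative.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
rows = [('ninja', 1609459200, 1000)]
cols = ['streamer', 'timestamp', 'follower_count']
youtube = spark.createDataFrame(rows, cols)
twitch = spark.createDataFrame(rows, cols)
twitter = spark.createDataFrame(rows, cols)

joined = join_3_tables_by_streamer_and_timestamp(youtube, twitch, twitter)
# Columns: streamer, timestamp, youtube_count, twitch_count, twitter_count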
Example #3
def spark_unified_aggregator(udf_agg: udf, table_df: dataframe, partition_key: str = 'streamer',
                             clustering_key: str = 'timestamp', clustering_key_alias: str = 'timestamp') -> dataframe:
    # Group by streamer and the UDF-bucketed timestamp, then average the
    # per-platform follower counts and the combined total.
    return table_df.groupBy(partition_key, udf_agg(clustering_key).alias(clustering_key_alias)) \
                   .agg(sql_mean('youtube_count').alias('youtube_count'),
                        sql_mean('twitter_count').alias('twitter_count'),
                        sql_mean('twitch_count').alias('twitch_count'),
                        sql_mean('total_count').alias('total_count'))
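
A sketch of the kind of bucketing function the udf_agg parameter appears intended for; the hour_bucket helper and enriched_df are assumptions, not part of the original code.

from pyspark.sql.functions import udf
from pyspark.sql.types import LongType

# Round epoch-second timestamps down to the start of their hour.
hour_bucket = udf(lambda ts: ts - (ts % 3600), LongType())

hourly_means = spark_unified_aggregator(hour_bucket, enriched_df)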
Example #4
def cleanColumnNamesForParquet(df: DataFrame) -> DataFrame:
    # Parquet rejects column names containing ' ,;{}()=', so lowercase them,
    # replace spaces with underscores and strip the offending characters.
    newColumns = []
    nchars = ',;{}()='
    for c in df.columns:
        c = c.lower()
        c = c.replace(' ', '_')
        for ch in nchars:
            c = c.replace(ch, '')
        newColumns.append(c)

    df = df.toDF(*newColumns)
    return df
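
A quick illustration, assuming an existing SparkSession named spark; the sample column names are made up.

raw = spark.createDataFrame([(1, 2)], ['User Name', 'Total (count)'])
clean = cleanColumnNamesForParquet(raw)
print(clean.columns)  # ['user_name', 'total_count']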
Example #5
def noaa_transform(df: DataFrame, columns: Dict[str, List]) -> DataFrame:

    # FRSHTT: one indicator digit each for fog, rain, snow, hail,
    # thunder and tornado; digit idx is set when it is not '0'.
    def __f(r, idx):
        v = list(str(r))
        return v[idx] != '0'

    transformFRSHTT = spark.udf.register("transformFRSHTT", __f, BooleanType())

    for i, c in enumerate(columns['FRSHTT']):
        df = df.withColumn(c, transformFRSHTT(df['FRSHTT'], spark_lit(i)))

    return df
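
A sketch of the expected columns argument: in NOAA GSOD data the six FRSHTT digits flag fog, rain, snow, hail, thunder and tornado, so a mapping along these lines is the likely intent (the exact target column names are assumptions).

weather_flags = {
    'FRSHTT': ['fog', 'rain', 'snow', 'hail', 'thunder', 'tornado'],
}
df = noaa_transform(df, weather_flags)
# e.g. FRSHTT '010000' yields rain=True and the other five flags False.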
Example #6
    def __fields_from_dataframe(self, dataframe_object: dataframe,
                                is_string: bool) -> list:
        # Classify columns by the Python type of the first row's values:
        # string columns when is_string is True, all others otherwise.
        text_fields = []
        first_row = dataframe_object.first()

        for column in dataframe_object.schema.names:
            if isinstance(first_row[column], str) == is_string:
                text_fields.append(column)

        return text_fields
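
A hypothetical illustration of the first-row heuristic (the sample frame is made up). Note that a column whose first-row value is None lands in the non-string bucket regardless of its schema type.

df = spark.createDataFrame([('alice', 37)], ['name', 'age'])
self.__fields_from_dataframe(df, True)   # -> ['name']
self.__fields_from_dataframe(df, False)  # -> ['age']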
Example #7
    def __save_classifier_result(self, predicted_df: dataframe,
                                 filename_metadata: dict) -> None:
        # Persist the dataset metadata document first.
        self.__database.update_one(
            filename_metadata["datasetName"], filename_metadata,
            {self.DOCUMENT_ID_NAME: self.METADATA_DOCUMENT_ID})

        document_id = 1
        for row in predicted_df.collect():
            row_dict = row.asDict()
            row_dict["_id"] = document_id
            # Convert the Spark ML probability vector to a serialisable list.
            row_dict["probability"] = row_dict["probability"].toArray().tolist()

            document_id += 1

            # Drop the heavyweight vector columns before storing the row.
            del row_dict["features"]
            del row_dict["rawPrediction"]

            self.__database.insert_one_in_file(
                filename_metadata["datasetName"], row_dict)

        self.__metadata_creator.update_finished_flag(
            filename_metadata["datasetName"], True)
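
For orientation, a sketch of one document the loop above would insert, assuming a typical Spark ML classification row; every field here other than _id and probability is illustrative.

example_document = {
    '_id': 1,
    'prediction': 0.0,
    'probability': [0.85, 0.15],  # DenseVector converted to a plain list
}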
Example #8
def spark_live_aggregator(udf_agg: udf, table_df: dataframe, initial_value: str, new_value_alias: str,
                          partition_key: str = 'streamer', clustering_key: str = 'timestamp',
                          clustering_key_alias: str = 'timestamp') -> dataframe:
    # Group by streamer and the UDF-bucketed timestamp, then average the
    # chosen value column under the requested alias.
    return table_df.groupBy(partition_key, udf_agg(clustering_key).alias(clustering_key_alias)) \
                   .agg(sql_mean(initial_value).alias(new_value_alias))
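
A hedged usage sketch; minute_bucket and live_df are assumptions, only the parameter names come from the function signature itself.

from pyspark.sql.functions import udf
from pyspark.sql.types import LongType

# Round epoch-second timestamps down to the start of their minute.
minute_bucket = udf(lambda ts: ts - (ts % 60), LongType())

per_minute = spark_live_aggregator(minute_bucket, live_df,
                                   initial_value='follower_count',
                                   new_value_alias='mean_follower_count')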
Example #9
    def write_to_hdfs(self, df: dataframe, filename):
        # coalesce(1) forces a single CSV part file under the target path.
        df.coalesce(1).write.mode('overwrite').csv(hdfs_prefix + filename,
                                                   header='true')
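
A hypothetical call, assuming hdfs_prefix is a module-level setting such as 'hdfs://namenode:8020/data/'; note that Spark writes the CSV as a directory containing the part file.

    # Hypothetical call from another method of the same class:
    self.write_to_hdfs(aggregated_df, 'streamer_counts')
    # -> <hdfs_prefix>streamer_counts/part-00000-*.csv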