Example #1
def main():

    # Load the csv files directly as dataframes
    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext
    df1 = spark.read.format("csv").option(
        "header", "true").load("simpsons_characters.csv")
    df2 = spark.read.format("csv").option("header",
                                          "true").load("simpsons_episodes.csv")
    df3 = spark.read.format("csv").option(
        "header", "true").load("simpsons_locations.csv")
    df4 = spark.read.format("csv").option(
        "header", "true").load("simpsons_script_lines.csv")

    # Get a df keyed by episode_id with the count of distinct characters.
    dfe = df4.groupBy('episode_id').agg(
        functions.approx_count_distinct(
            df4.character_id).alias("character_count")).selectExpr(
                "episode_id as id", "character_count")

    # Get a df with id, imdb_rating and the character count.
    dfjoin1 = df2.join(dfe, on="id",
                       how='outer').select("id", "imdb_rating",
                                           "character_count")

    # Get (id, gender) and (id, episode) and join them on the character ids.
    dfc = df1.select('id', 'gender').selectExpr("id as character_id", "gender")
    dfe = df4.select('character_id', 'episode_id')
    dfjoin2 = dfc.join(dfe, on="character_id",
                       how='outer').select("episode_id", "character_id",
                                           "gender")

    # To count by gender, filter and group by episode id, then count per episode.
    dfCharacMasc = dfjoin2.filter(
        dfjoin2.gender == 'm').groupBy('episode_id').agg(
            functions.approx_count_distinct(
                dfjoin2.character_id).alias("masc_count")).selectExpr(
                    "episode_id as id", "masc_count")
    dfCharacFem = dfjoin2.filter(
        dfjoin2.gender == 'f').groupBy('episode_id').agg(
            functions.approx_count_distinct(
                dfjoin2.character_id).alias("fem_count")).selectExpr(
                    "episode_id as id", "fem_count")

    # Finally, join all the fields into a single dataframe and show it.
    dfjoin3 = dfjoin1.join(dfCharacMasc, on="id",
                           how='outer').select("id", "imdb_rating",
                                               "character_count", "masc_count")
    dfjoin4 = dfjoin3.join(dfCharacFem, on="id",
                           how='outer').select("id", "imdb_rating",
                                               "character_count", "masc_count",
                                               "fem_count")

    dfjoin4.show()
    sc.stop()
Example #2
def join_to_weekly_budget(df, email_vols):
    return (
        df.join(email_vols, on="prim_party_id")
        .withColumn("email_budget", F.col("nonflight"))
        .drop("flight", "nonflight")
        .select(
            "*",
            F.rank().over(
                Window.partitionBy(F.col("prim_party_id"))
                .orderBy(F.col("score").desc())).alias("rank"))
        .withColumn(
            "rank_adj",
            F.when(F.col("OFFER_TYPE") != "recommended",
                   F.col("rank") * 100).otherwise(F.col("rank")))
        .drop("rank")
        .select(
            "*",
            F.rank().over(
                Window.partitionBy(F.col("prim_party_id"))
                .orderBy(F.col("rank_adj").desc())).alias("rank"))
        .withColumn(
            "elig_for_week",
            F.approx_count_distinct("action_id").over(
                Window.partitionBy(F.col("prim_party_id"))))
        .withColumn(
            "email_budget_week",
            F.when(F.col("elig_for_week") < F.col("email_budget"),
                   F.col("elig_for_week")).otherwise(F.col("email_budget")))
        .filter(F.col("rank") <= F.col("email_budget_week"))
        .drop("email_budget", "score", "rank_adj", "offer_send_time",
              "send_date", "elig_for_week"))
Example #3
def perform_eda(data_pipeline, target, feature, nonnull_cutoff,
                except_for_these, save_fig):
    spark = _loadSpark()
    base = spark.sql(data_pipeline)
    base.persist()

    cols_drop = []
    for col_name in base.columns:
        if base.select(col_name).filter((F.col(col_name).isNotNull()) | (F.col(
                col_name).cast('double') == 0.0)).count() <= nonnull_cutoff:
            cols_drop.append(col_name)
    base2 = base.drop(*cols_drop)
    print(
        '*** Dropped columns that are all null or have at most '
        + str(nonnull_cutoff) + ' non-null values...')
    print(str(cols_drop))
    base.unpersist()
    base2.persist()

    print('\n*** Excluded features including a target...')
    print(str(except_for_these))

    print('\n*** Excluded timestamp type and single value numeric features...')
    num = []
    cat = []
    for c in base2.drop(*except_for_these).dtypes:
        if c[1] == 'string':
            cat += [c[0]]
            base2 = base2.withColumn(
                c[0],
                F.when((F.col(c[0]) == '') |
                       (F.col(c[0]) == ' ') | (F.col(c[0]).isNull()),
                       F.lit('None')).otherwise(F.col(c[0])))
        if c[1] not in ('string', 'timestamp') and base2.select(
                F.approx_count_distinct(c[0])).collect()[0][0] > 1:
            num += [c[0]]
            base2 = (base2.withColumn(
                c[0],
                F.when((F.col(c[0]) == '') | (F.col(c[0]) == ' '),
                       F.lit(None)).otherwise(F.col(c[0]))).withColumn(
                           c[0],
                           F.col(c[0]).cast('double')))

    print('*** Distribution by target value ***')
    base2.groupBy(target).count().show()

    print('*** Describe categorical variables in the data set ***')
    for i in cat:
        base2.groupBy(i).count().orderBy('count',
                                         ascending=False).show(2000,
                                                               truncate=False)

    print('*** Describe numeric variables in the data set ***')
    for i in num:
        base2.select(i).summary().show(truncate=False)
        if save_fig:
            tmp = base2.select(i).toPandas()
            tmp.hist(figsize=(12, 10))
            plt.savefig('./fig/' + i + '.png')
            plt.close()
Example #4
def main():

    # Load the csv files directly as dataframes
    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext
    df1 = spark.read.format("csv").option(
        "header", "true").load("simpsons_characters.csv")
    df2 = spark.read.format("csv").option("header",
                                          "true").load("simpsons_episodes.csv")
    df3 = spark.read.format("csv").option(
        "header", "true").load("simpsons_locations.csv")
    df4 = spark.read.format("csv").option(
        "header", "true").load("simpsons_script_lines.csv")

    dfScriptLines = df4.groupBy('episode_id').agg(
        functions.approx_count_distinct(
            df4.location_id).alias("count")).selectExpr(
                "episode_id as id", "count")

    dfLocations = df2.join(dfScriptLines, on="id",
                           how='outer').select("id", "imdb_rating", "count")

    dfLocations.show()

    sc.stop()
Example #5

def getColumnUniqueCounts(clickDF):
    """
    Return the approximate distinct count of every column whose name
    starts with "c".

    Parameters
    ----------
    clickDF : DataFrame

    Returns
    -------
    DataFrame
        A single-row DataFrame with one approximate distinct count per column.
    """
    distvals = clickDF.agg(*(approx_count_distinct(col(c)).alias(c) for c in clickDF.columns if str.startswith(c,"c")))
    print(type(distvals))
    return distvals
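
A minimal usage sketch (the local SparkSession and the toy columns below are illustrative assumptions, not part of the original snippet; the function itself expects approx_count_distinct and col to be imported in its module):

from pyspark.sql import SparkSession
from pyspark.sql.functions import approx_count_distinct, col

spark = SparkSession.builder.getOrCreate()
clickDF = spark.createDataFrame(
    [(1, "ad-1", "us"), (2, "ad-1", "de"), (3, "ad-2", "us")],
    ["id", "campaign", "country"])
# Only columns whose names start with "c" ("campaign", "country") are profiled.
getColumnUniqueCounts(clickDF).show()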
Example #6
    def test_aggregator(self):
        df = self.df
        g = df.groupBy()
        self.assertEqual([99, 100], sorted(g.agg({'key': 'max', 'value': 'count'}).collect()[0]))
        self.assertEqual([Row(**{"AVG(key#0)": 49.5})], g.mean().collect())

        from pyspark.sql import functions
        self.assertEqual((0, u'99'),
                         tuple(g.agg(functions.first(df.key), functions.last(df.value)).first()))
        self.assertTrue(95 < g.agg(functions.approx_count_distinct(df.key)).first()[0])
        self.assertEqual(100, g.agg(functions.countDistinct(df.value)).first()[0])
Example #8
    def aggregate_job(self, df_news):

        df_finalized = df_news.groupby('action_state', 'Year', 'Actor1Type1Code') \
            .agg(
            F.approx_count_distinct(
                'GLOBALEVENTID').alias('events_count'),
            F.sum('normg_scale').alias('norm_score_cale'))
        # Calculate avg by dividing by event count

        df_finalized.show(df_finalized.count())
        # df_finalized.printSchema()

        return df_finalized
Example #9
def distinct_count(table, col_names, method="exact"):
    try:
        if method == "exact" or method is None:
            from pyspark.sql.functions import col, countDistinct
            uniques = table.agg(*(countDistinct(col(c)).alias(c) for c in col_names))
        elif method == "approx":
            from pyspark.sql.functions import col, approx_count_distinct
            uniques = table.agg(*(approx_count_distinct(col(c)).alias(c) for c in col_names))
        else:
            raise ValueError("Unknown method {}, choose between ['exact', 'approx']".format(method))
    except Exception:
        # Re-raise so the caller never reaches the return with `uniques` unbound.
        print("Cannot resolve column: {}".format(col_names))
        raise
    return uniques
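
A quick usage sketch (the local SparkSession and toy data are assumptions, not part of the original snippet):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
table = spark.createDataFrame(
    [("a", 1), ("a", 2), ("b", 2)], ["letter", "number"])
# Exact counts via countDistinct, approximate counts via HyperLogLog++.
distinct_count(table, ["letter", "number"], method="exact").show()
distinct_count(table, ["letter", "number"], method="approx").show()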
Example #10
def approx_cardinalities(df, cols):
    from functools import reduce
    
    counts = df.groupBy(
        F.lit(True).alias("drop_me")
    ).agg(
        F.count('*').alias("total"),
        *[F.approx_count_distinct(F.col(c)).alias(c) for c in cols]
    ).drop("drop_me").cache()
    
    result = reduce(lambda l, r: l.unionAll(r), [counts.select(F.lit(c).alias("field"), F.col(c).alias("approx_count")) for c in counts.columns]).collect()
    counts.unpersist()
    
    return dict([(r[0],r[1]) for r in result])
Example #11
    def aggregate_job(self, df_joined):
        '''
        Perform group-by and aggregation on the joined data frame.

        :param df_joined: joined events and mentions data frame
        :return: aggregated data frame
        '''

        df_finalized = df_joined.groupby('state', 'Year', 'MonthYear', 'Actor1Type1Code') \
         .agg(
          functions.approx_count_distinct('GLOBALEVENTID').alias('events_count'),
          functions.sum('GoldsteinScale_norm').alias('GoldsteinScale_norm_sum'),
          functions.sum('Avg_Confidence').alias('confidence_sum')
          )
        df_finalized.show()
        return df_finalized
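
A hypothetical call, assuming `job` is an instance of the enclosing class and `df_joined` already carries the columns referenced above (state, Year, MonthYear, Actor1Type1Code, GLOBALEVENTID, GoldsteinScale_norm, Avg_Confidence):

# `job` and `df_joined` are placeholder names for illustration only.
df_agg = job.aggregate_job(df_joined)
df_agg.select("events_count", "GoldsteinScale_norm_sum", "confidence_sum").show(5)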
Example #12
def aggregate(uri, conf):
    spark = SparkSession.builder.config(conf=conf).getOrCreate()

    parsed_df = spark.read.format('csv').options(header=True,
                                                 inferschema=True).load(uri)

    columns = parsed_df.columns
    identify_aggregations(columns)

    print("unique_params_indices: " + str(unique_params_indices))
    print("numeric_params_indices: " + str(numeric_params_indices))

    # initiate output DF with record count
    cnt_col_name = spark.conf.get("output") + "_count"
    output = parsed_df.groupBy([parsed_df.temp_res, parsed_df.spat_res]) \
                      .count()

    output = output.withColumnRenamed("count", cnt_col_name)

    # select and handle categorical attributes
    cat_col_names = [columns[i] for i in unique_params_indices]
    for name in cat_col_names:
        col = parsed_df.groupBy([parsed_df.temp_res, parsed_df.spat_res]) \
                       .agg(F.approx_count_distinct(name).alias(name+"_uniq"))

        output = output.join(col, ["temp_res", "spat_res"], 'left')

    # select and handle numeric attributes
    num_col_names = [columns[i] for i in numeric_params_indices]
    # fill in NaN numeric values to avoid error
    parsed_df = parsed_df.fillna(0, subset=num_col_names)

    for name in num_col_names:
        col = parsed_df.groupBy([parsed_df.temp_res, parsed_df.spat_res]) \
                       .agg(F.mean(name).alias(name+"_avg"))

        output = output.join(col, ["temp_res", "spat_res"], 'left')

    # write aggregated dataset
    out_dir = spark.conf.get("output")
    out_dir = "aggregates/" + out_dir

    # output.printSchema()
    # output.show(40)
    # print(output.count())
    print("output directory is: " + out_dir)
    output.write.csv(out_dir, header=True)
    spark.stop()
Example #13
    def group_distinct(self, df_, columns: list) -> dict:
        """

        :param df_: pysaprk df
        :param columns: columns of type str in the dataframe
        :return: dict of columns and distinct values
        """
        data = []
        for c in columns:
            cnt = df_.agg(approx_count_distinct(col(c)).alias(c)).collect()
            data.append(cnt[0][0])
        df_pd = pd.DataFrame(index=columns,
                             columns=["num_distinct"],
                             data=data)
        return df_pd.sort_values("num_distinct", axis=0,
                                 ascending=True).to_dict()["num_distinct"]
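
Hypothetical usage, assuming the enclosing class is instantiated as `profiler` (a placeholder name) and `df` is a Spark DataFrame with string columns `country` and `device`:

# Returns a dict keyed by column name, in ascending order of distinct count,
# e.g. {'device': ..., 'country': ...} (values depend on the data).
distinct_map = profiler.group_distinct(df, ["country", "device"])
print(distinct_map)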
Example #14
    def process_measures_from_grouped_message(self):
        try:
            input_df = self.get_kafka_consumer()
            flat_message_df = self.get_parsed_message(input_df)

            unique_users_measure_df = flat_message_df.groupBy(
                'time_window').agg(
                    F.approx_count_distinct("users").alias(
                        "unique_users")).orderBy('time_window', 'unique_users')

            country_count_df1 = flat_message_df.groupBy(
                'time_window',
                'country').count().orderBy('time_window', 'count')
            most_represented_countries_df = country_count_df1.withColumn(
                "rank_max",
                F.rank().over(
                    Window.partitionBy("time_window").orderBy(F.desc("count")))
            ).where(F.col("rank_max") == 1).orderBy("time_window").select(
                F.col("time_window"),
                F.col("country").alias("most_represented_country"),
                F.col("count").alias("most_represented_country_count"))
            least_represented_countries_df = country_count_df1.withColumn(
                "rank_min",
                F.rank().over(
                    Window.partitionBy("time_window").orderBy(F.col("count")))
            ).where(F.col("rank_min") == 1).orderBy("time_window").select(
                F.col("time_window"),
                F.col("country").alias("least_represented_country"),
                F.col("count").alias("least_represented_country_count"))

            most_represented_country_df = self.get_first_record_from_grouped_data(
                most_represented_countries_df, "most_represented_country")
            least_represented_country_df = self.get_first_record_from_grouped_data(
                least_represented_countries_df, "least_represented_country")

            measures_df = most_represented_country_df.join(least_represented_country_df,"time_window").join(unique_users_measure_df,"time_window")\
                .select(most_represented_countries_df.time_window,F.col("most_represented_country")\
                        ,F.col("most_represented_country_count"),F.col("least_represented_country")\
                        ,F.col("least_represented_country_count"),F.col("unique_users"))

            measures_df.write.format("csv").save(self.output_file_dir)

            #self.logger.info(most_represented_country_df.collect())
            #self.logger.info(least_represented_country_df.collect())
            #self.logger.info(unique_users_measure_df.collect())
        except Exception as e:
            self.logger.error(e)
Example #15
def removeInvalidCols(df):
    '''
    Remove columns with all Nans or all same values
    Parameter:
        - dataframe
    Returns:
        - dataframe (after removing cols)
    '''
    count_distinct = df.select([
        F.approx_count_distinct(c).alias("{0}".format(c)) for c in df.columns
    ])

    # consider only columns with >= 2 distincts
    distinct = count_distinct.toPandas().to_dict(orient='list')
    to_consider = [k for k in distinct if distinct[k][0] >= 2]

    # return the cleaned dataframe
    return df[to_consider]
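
A minimal sketch of calling it (assumes `F` is pyspark.sql.functions in the module above; the schema and data here are illustrative):

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

spark = SparkSession.builder.getOrCreate()
schema = StructType([
    StructField("id", IntegerType()),
    StructField("label", StringType()),
    StructField("all_null", StringType()),
])
sample_df = spark.createDataFrame(
    [(1, "x", None), (2, "x", None), (3, "y", None)], schema)
# "all_null" has no distinct values and is dropped; "id" and "label" survive.
removeInvalidCols(sample_df).printSchema()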
Example #16
def _deduplication_approximated_todo(columns, df):
    """
    Returns what (columns, as in spark columns) to compute to get the results requested by
    the parameters.

    :param columns:
    :type columns: list
    :param df:
    :type df: DataFrame
    :return: Pyspark columns representing what to compute.
    """
    if columns is None:
        print(
            "Approximated count distinct spanning over the whole row is currently not supported"
        )
    else:
        # multiple count distinct, one column each
        todo = [approx_count_distinct(col(c)).alias(c) for c in columns]
    return todo
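
A short sketch of how the returned columns are typically consumed, in a single agg pass (the toy DataFrame is an assumption; approx_count_distinct and col are assumed imported in the module above):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a", 1), ("a", 1), ("b", 2)], ["k", "v"])
todo = _deduplication_approximated_todo(["k", "v"], df)
# One pass over the data computes every approximate distinct count at once.
df.agg(*todo).show()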
Example #17
def main():
    # base streams from kafka
    submission_stream = stream_from_kafka('SubmissionProducer')
    comment_stream = stream_from_kafka('CommentProducer')
    # subreddit_stream = stream_from_kafka('SubredditProducer')
    # redditor_stream = stream_from_kafka('RedditorProducer')

    # persist base streams to hdfs for batch processing
    write_to_hdfs(submission_stream, 'submission')
    write_to_hdfs(comment_stream, 'comment')
    # write_to_hdfs(subreddit_stream, 'subreddit')
    # write_to_hdfs(redditor_stream, 'redditor')

    # streaming stats
    windowed_submissions = submission_stream \
        .groupby(window('created_utc', '5 seconds', '5 seconds')) \
        .count() \
        .withColumn('window', col('window')['end']) \
        .withColumn('date', to_date('window'))

    windowed_comments = comment_stream \
        .groupby(window('created_utc', '5 seconds', '5 seconds')) \
        .count() \
        .withColumn('window', col('window')['end']) \
        .withColumn('date', to_date('window'))

    windowed_unique_active_subreddits = submission_stream \
        .select('created_utc', 'subreddit') \
        .union(comment_stream.select('created_utc', 'subreddit')) \
        .groupby(window('created_utc', '5 seconds', '5 seconds')) \
        .agg(approx_count_distinct('subreddit'), count('subreddit')) \
        .withColumn('window', col('window')['end']) \
        .withColumn('date', to_date('window')) \
        .withColumnRenamed('approx_count_distinct(subreddit)', 'active_subreddits') \
        .withColumnRenamed('count(subreddit)', 'submissions_and_comments')

    # persist streaming stats to cassandra
    stream_to_cassandra(windowed_submissions, 'windowed_submissions')
    stream_to_cassandra(windowed_comments, 'windowed_comments')
    stream_to_cassandra(windowed_unique_active_subreddits,
                        'unique_active_subreddits')

    spark_session.streams.awaitAnyTermination()
Example #18
def profile(df):
    x = spark.createDataFrame(sc.emptyRDD(), SCHEMA)
    df.cache()
    for field in df.schema.fields:
        if isinstance(field.dataType,
                      (pst.StructField, pst.ArrayType, pst.MapType,
                       pst.StructType, pst.BooleanType)):
            continue
        else:
            y = df.agg(
                F.lit(field.name).alias("desc"),
                F.count(field.name).alias('count'),
                F.min(field.name).alias('min'),
                F.max(field.name).alias('max'),
                F.sum(field.name).alias('sum'),
                F.stddev(field.name).alias('stddev'),
                F.max(F.length(field.name)).alias('max_length'),
                F.approx_count_distinct(field.name).alias('cardinality'))
            x = x.union(y)
    return x
Example #19
    def test_aggregator(self):
        df = self.df
        g = df.groupBy()
        self.assertEqual([99, 100],
                         sorted(
                             g.agg({
                                 "key": "max",
                                 "value": "count"
                             }).collect()[0]))
        self.assertEqual([Row(**{"AVG(key#0)": 49.5})], g.mean().collect())

        from pyspark.sql import functions

        self.assertEqual((0, "99"),
                         tuple(
                             g.agg(functions.first(df.key),
                                   functions.last(df.value)).first()))
        self.assertTrue(
            95 < g.agg(functions.approx_count_distinct(df.key)).first()[0])
        # test deprecated countDistinct
        self.assertEqual(100,
                         g.agg(functions.countDistinct(df.value)).first()[0])
Example #20
def readFile(spark, input_file):
    '''read file (clickstream.parquet) and add new features to dataframe'''
    parquetFile = spark.read.parquet(input_file)  # clickstream.parquet
    parquetFile = parquetFile.distinct()  # drop duplicates
    w = Window.partitionBy('ad_id')
    parquetFile = parquetFile.withColumn(
        'day_count',
        fn.approx_count_distinct('date').over(w))
    parquetFile = parquetFile.withColumn(
        'is_cpm', (fn.col('ad_cost_type') == 'CPM').cast('integer'))
    parquetFile = parquetFile.withColumn(
        'is_cpc', (fn.col('ad_cost_type') == 'CPC').cast('integer'))

    parquetFile = parquetFile.withColumn(
        'is_click', (fn.col('event') == 'click').cast('integer'))
    parquetFile = parquetFile.withColumn(
        'is_view', (fn.col('event') == 'view').cast('integer'))

    parquetFile = parquetFile.withColumn(
        'CTR',
        fn.when(fn.sum('is_view').over(w) == 0, 0).otherwise(
            fn.sum('is_click').over(w) / fn.sum('is_view').over(w)))
    return parquetFile
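
A minimal usage sketch (the path is illustrative; the module is assumed to import `fn` as pyspark.sql.functions and `Window` from pyspark.sql.window):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
clicks = readFile(spark, "clickstream.parquet")
clicks.select("ad_id", "date", "day_count", "is_click", "is_view", "CTR").show(5)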
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100)
  ]
schema = ["employee_name", "department", "salary"]
  
  
df = spark.createDataFrame(data=simpleData, schema = schema)
df.printSchema()
df.show(truncate=False)

print("approx_count_distinct: " + \
      str(df.select(approx_count_distinct("salary")).collect()[0][0]))

print("avg: " + str(df.select(avg("salary")).collect()[0][0]))

df.select(collect_list("salary")).show(truncate=False)

df.select(collect_set("salary")).show(truncate=False)

df2 = df.select(countDistinct("department", "salary"))
df2.show(truncate=False)
print("Distinct Count of Department &amp; Salary: "+str(df2.collect()[0][0]))

print("count: "+str(df.select(count("salary")).collect()[0]))
df.select(first("salary")).show(truncate=False)
df.select(last("salary")).show(truncate=False)
df.select(kurtosis("salary")).show(truncate=False)
Example #22
def estimate_segments(
    df: pyspark.sql.dataframe.DataFrame,
    target_field: str = None,
    max_segments: int = 30,
    include_columns: List[str] = [],
    unique_perc_bounds: Tuple[float, float] = [None, 0.8],
    null_perc_bounds: Tuple[float, float] = [None, 0.2],
) -> Optional[Union[List[Dict], List[str]]]:
    """
    Estimates the most important features and values on which to segment
    data profiling using entropy-based methods.

    If no target column provided, maximum entropy column is substituted.

    :param df: the dataframe of data to profile
    :param target_field: target field (optional)
    :param max_segments: upper threshold for total combinations of segments,
    default 30
    :param include_columns: additional non-string columns to consider in automatic segmentation. Warning: high cardinality columns will degrade performance.
    :param unique_perc_bounds: tuple of form [lower, upper] with bounds on the percentage of unique values (|unique| / |X|). Upper bound exclusive.
    :param null_perc_bounds: tuple of form [lower, upper] with bounds on the percentage of null values. Upper bound exclusive.
    :return: a list of segmentation feature names
    """
    current_split_columns = []
    segments = []
    segments_used = 1
    max_entropy_column = (float("-inf"), None)

    if not unique_perc_bounds[0]:
        unique_perc_bounds[0] = float("-inf")
    if not unique_perc_bounds[1]:
        unique_perc_bounds[1] = float("inf")
    if not null_perc_bounds[0]:
        null_perc_bounds[0] = float("-inf")
    if not null_perc_bounds[1]:
        null_perc_bounds[1] = float("inf")

    valid_column_names = set()

    count = df.count()

    print("Limiting to categorical (string) data columns...")
    valid_column_names = {col for col in df.columns if (df.select(col).dtypes[0][1] == "string" or col in include_columns)}

    print("Gathering cardinality information...")
    n_uniques = {col: df.agg(F.approx_count_distinct(col)).collect()[0][0] for col in valid_column_names}
    print("Gathering missing value information...")
    n_nulls = {col: df.filter(df[col].isNull()).count() for col in valid_column_names}

    print("Finding valid columns for autosegmentation...")
    for col in valid_column_names.copy():
        null_perc = 0.0 if count == 0 else n_nulls[col] / count
        unique_perc = 0.0 if count == 0 else n_uniques[col] / count
        if (
            col in segments
            or n_uniques[col] <= 1
            or null_perc < null_perc_bounds[0]
            or null_perc >= null_perc_bounds[1]
            or unique_perc < unique_perc_bounds[0]
            or unique_perc >= unique_perc_bounds[1]
        ):
            valid_column_names.remove(col)

    if not valid_column_names:
        return []

    if not target_field:
        print("Finding alternative target field since none were specified...")
        for col in valid_column_names:
            col_entropy = _simple_entropy(df, col)
            if n_uniques[col] > 1:
                col_entropy /= math.log(n_uniques[col])
            if col_entropy > max_entropy_column[0]:
                max_entropy_column = (col_entropy, col)
        target_field = max_entropy_column[1]

    print(f"Using {target_field} column as target field.")
    assert target_field in df.columns
    valid_column_names.add(target_field)
    valid_column_names = list(valid_column_names)

    countdf = df.select(valid_column_names).groupby(valid_column_names).count().cache()

    print("Calculating segments...")
    while segments_used < max_segments:
        valid_column_names = {col for col in valid_column_names if (col not in segments and n_uniques[col] * segments_used <= (max_segments - segments_used))}
        _, segment_column_name = _find_best_split(
            countdf, current_split_columns, list(valid_column_names), target_column_name=target_field, normalization=n_uniques
        )

        if not segment_column_name:
            break

        segments.append(segment_column_name)
        current_split_columns.append(segment_column_name)
        segments_used *= n_uniques[segment_column_name]

    return segments
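
A hedged usage sketch; `df` stands for any Spark DataFrame and the column name is a placeholder. The call assumes the private helpers used above (_simple_entropy, _find_best_split) are available in the same module:

# Let the function pick a maximum-entropy target automatically...
segments = estimate_segments(df, max_segments=30)
# ...or pin the target column explicitly and tighten the segment budget.
segments = estimate_segments(df, target_field="churn_flag", max_segments=10)
print(segments)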
Example #23
    def get_data(self):
        """
        Returns statistics about attributes in a data frame
        """

        from pyspark.sql import functions

        # Correlation pairs
        corr_pairs = list(
            chunks(list(itertools.product(self.attrs, self.attrs)),
                   len(self.attrs)))

        # Cache data
        self.data.cache()

        df_count = self.data.count()

        # TODO: Implement median using df.approxQuantile('col', [.5], .25)

        stats = []
        for i, name in enumerate(self.attrs):
            df_col = functions.col(name)
            stats.append(functions.lit(name))
            stats.append(functions.max(df_col).alias('max_{}'.format(name)))
            stats.append(functions.min(df_col).alias('min_{}'.format(name)))
            if name in self.numeric_attrs:
                stats.append(
                    functions.round(functions.stddev(df_col),
                                    4).alias('stddev_{}'.format(name)))
            else:
                stats.append(functions.lit('-'))
            stats.append(
                functions.count(df_col).alias('count_{}'.format(name)))
            if name in self.numeric_attrs:
                stats.append(
                    functions.round(functions.avg(df_col),
                                    4).alias('avg_{}'.format(name)))
            else:
                stats.append(functions.lit('-'))

            stats.append(
                functions.approx_count_distinct(df_col).alias(
                    'distinct_{}'.format(name)))
            stats.append((df_count - functions.count(df_col)).alias(
                'missing_{}'.format(name)))

            if name in self.numeric_attrs:
                stats.append(
                    functions.round(functions.skewness(df_col),
                                    2).alias('skewness_{}'.format(name)))
                stats.append(
                    functions.round(functions.kurtosis(df_col),
                                    2).alias('kurtosis_{}'.format(name)))
            else:
                stats.append(functions.lit('-'))
                stats.append(functions.lit('-'))

            if self.params['correlation']:
                for pair in corr_pairs[i]:
                    if all([
                            pair[0] in self.numeric_attrs, pair[1]
                            in self.numeric_attrs
                    ]):
                        stats.append(
                            functions.round(functions.corr(*pair),
                                            4).alias('corr_{}'.format(i)))
                    else:
                        stats.append(functions.lit('-'))

        self.data = self.data.agg(*stats)
        aggregated = self.data.take(1)[0]
        n = len(self.names)
        rows = [aggregated[i:i + n] for i in range(0, len(aggregated), n)]

        return {"rows": rows, "attributes": self.get_column_names().split(',')}
Example #24
    def _get_approx_distinct_count_for_col(self,
                                           df: DataFrame,
                                           _dcol: str,
                                           _rsd=0.05):
        return df.select(F.approx_count_distinct(col(_dcol), rsd=_rsd)) \
            .rdd.map(lambda row: row[0]).collect()[0]
Example #25

data = data.select(
    col("data").getItem(0).alias("TS"),
    col("data").getItem(1).alias("UID"),
    col("data").getItem(2).alias("URL"),
    col("data").getItem(3).alias("Title"),
    col("data").getItem(4).alias("User-Agent"),
)

data = data.selectExpr("TS", "UID", 'parse_url(URL, "HOST") as domain')

data = data.withColumn("TS", from_unixtime(col("TS").cast("int")))

data = data.withColumn(
    "zone",
    when(col("domain").endswith(".ru"), "ru").otherwise("not ru"))

data = data.groupBy("zone", window("TS", "2 seconds", "1 second")).agg(
    count("UID").alias("view"),
    approx_count_distinct(col("UID")).alias("unique"))

data = data.sort(["window", "view", "zone"], ascending=[True, False,
                                                        True]).limit(20)
data = data.select(col("window"), col("zone"), col("view"), col("unique"))

query = (data.writeStream.outputMode("complete").format("console").option(
    "truncate", "false").trigger(once=args.once,
                                 processingTime=args.processing_time).start())

query.awaitTermination()
        StructField("merchant_id", LongType(), True),
        StructField("created_time_ist", StringType(), True)
    ])

    txnFctDf = sparkSession.createDataFrame(txnFctRdd, txnFctSchema)
    txnFctDf.printSchema()
    txnFctDf.show(5, False)

    # Applying tranformation on dataframe using DSL (Domain Specific Language)
    txnFctDf = txnFctDf\
        .withColumn("created_time_ist", unix_timestamp(txnFctDf["created_time_ist"], "yyyy-MM-dd HH:mm:ss").cast(TimestampType()))

    txnFctDf.printSchema()
    txnFctDf.show(5, False)

    print("# of records = " + str(txnFctDf.count()))
    print("# of merchants = " +
          str(txnFctDf.select(txnFctDf["merchant_id"]).distinct().count()))

    txnAggDf = txnFctDf\
        .repartition(10, txnFctDf["merchant_id"])\
        .groupBy("merchant_id")\
        .agg(sum("amount"), approx_count_distinct("status"))

    txnAggDf.show(5, False)

    txnAggDf = txnAggDf\
        .withColumnRenamed("sum(amount)", "total_amount")\
        .withColumnRenamed("approx_count_distinct(status)", "dist_status_count")

    txnAggDf.show(5, False)
Example #27

    df_news.show(10)

    df_news = df_news.filter(df_news.ActionGeo_CountryCode == 'US')
    df_news = df_news.filter(df_news.Actor1Code != 'null')
    df_news = df_news.filter(df_news.Actor1Type1Code != 'null')

    name = 'ActionGeo_ADM1Code'
    udf = UserDefinedFunction(lambda x: x[:2]+'-'+x[2:], StringType())
    df_news = df_news.select(
        *[udf(column).alias(name) if column == name else column for column in df_news.columns])

    split_col = F.split(df_news['ActionGeo_ADM1Code'], '-')
    df_news = df_news.withColumn('action_state', split_col.getItem(1))
    df_news.show()

    df_news = df_news.groupby('action_state', 'Year', 'Actor1Type1Code').agg(F.approx_count_distinct('GLOBALEVENTID').alias('event_code'),
                                                                             # F.col('SQLDATE'),
                                                                             # F.collect_list('EventCode'),
                                                                             # F.col('Actor1Name'),
                                                                             # F.col('Actor2Name'),
                                                                             # F.collect_list('ActionGeo_ADM1Code'),
                                                                             # F.collect_list('action_state'),
                                                                             # F.collect_list('ActionGeo_Fullname'),
                                                                             F.avg('GoldsteinScale').alias(
        'goldstein_scale'),
        F.avg('AvgTone').alias('avg_tone'))

    df_news.show(df_news.count())
    df_news.printSchema()

    postgres_dump(config, df_news)
Example #28
df.createOrReplaceTempView("dfTable")

# COMMAND ----------

from pyspark.sql.functions import count
df.select(count("StockCode")).show()  # 541909

# COMMAND ----------

from pyspark.sql.functions import countDistinct
df.select(countDistinct("StockCode")).show()  # 4070

# COMMAND ----------

from pyspark.sql.functions import approx_count_distinct
df.select(approx_count_distinct("StockCode", 0.1)).show()  # 3364

# COMMAND ----------

from pyspark.sql.functions import first, last
df.select(first("StockCode"), last("StockCode")).show()

# COMMAND ----------

from pyspark.sql.functions import min, max
df.select(min("Quantity"), max("Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import sum
df.select(sum("Quantity")).show()  # 5176450
Example #29
    def run_pipeline(self):
        try:
            logging.info(
                "https://sparkbyexamples.com/pyspark/pyspark-aggregate-functions/"
            )
            # check collect_list and collect_set
            #collect_set() function returns all values from an input column with duplicate values eliminated.
            #collect_list() function returns all values from an input column with duplicates

            logging.info(
                'run_pipeline method started --> https://sparkbyexamples.com/pyspark/pyspark-explode-array-and-map-columns-to-rows/'
            )
            simpleData = [("James", "Sales", 3000), ("Michael", "Sales", 4600),
                          ("Robert", "Sales", 4100),
                          ("Maria", "Finance", 3000), ("James", "Sales", 3000),
                          ("Scott", "Finance", 3300), ("Jen", "Finance", 3900),
                          ("Jeff", "Marketing", 3000),
                          ("Kumar", "Marketing", 2000),
                          ("Saif", "Sales", 4100)]
            schema = ["employee_name", "department", "salary"]

            df = self.spark.createDataFrame(data=simpleData,
                                            schema=schema).cache()
            df.show(truncate=False)

            from pyspark.sql.functions import approx_count_distinct, collect_list
            from pyspark.sql.functions import collect_set, sum, avg, max, countDistinct, count
            from pyspark.sql.functions import first, last, kurtosis, min, mean, skewness
            from pyspark.sql.functions import stddev, stddev_samp, stddev_pop, sumDistinct
            from pyspark.sql.functions import variance, var_samp, var_pop
            df.printSchema()
            df.show(truncate=False)

            print("approx_count_distinct: " + \
                  str(df.select(approx_count_distinct("salary")).collect()[0][0]))

            print("avg: " + str(df.select(avg("salary")).collect()[0][0]))

            df.select(collect_list("salary")).show(truncate=False)

            df.select(collect_set("salary")).show(truncate=False)

            df2 = df.select(countDistinct("department", "salary"))
            df2.show(truncate=False)
            print("Distinct Count of Department & Salary: " +
                  str(df2.collect()[0][0]))

            print("count: " + str(df.select(count("salary")).collect()[0]))
            dffirst = df.select(first("salary"))
            dffirst.show(truncate=False)
            df.select(last("salary")).show(truncate=False)
            df.select(kurtosis("salary")).show(truncate=False)
            df.select(max("salary")).show(truncate=False)
            df.select(min("salary")).show(truncate=False)
            df.select(mean("salary")).show(truncate=False)
            df.select(skewness("salary")).show(truncate=False)
            df.select(stddev("salary"), stddev_samp("salary"), \
                      stddev_pop("salary")).show(truncate=False)
            df.select(sum("salary")).show(truncate=False)
            df.select(sumDistinct("salary")).show(truncate=False)
            df.select(variance("salary"), var_samp("salary"), var_pop("salary")) \
                .show(truncate=False)

            logging.info('run_pipeline method ended')
        except Exception as exp:
            logging.error("An error occured while running the pipeline > " +
                          str(exp))
            # send email notification
            # log error to database
            sys.exit(1)

        return
Example #30
def create_base(_undersample,undersample_rate, _save_fig,data_pipeline, target,except_for_these,nonnull_cutoff,unique_num_value,balThreshold,n_feat, use_wgt):
    base = spark.sql(data_pipeline)
    base.persist()
    
    #Drop all Nulls or less than a threshold
    cols_drop = []
    for col_name in base.drop(*except_for_these).columns:
        if base.select(col_name).filter((F.col(col_name).isNotNull()) | (F.col(col_name).cast('double') == 0.0)).count() <= nonnull_cutoff:
            cols_drop.append(col_name)
    base2 = base.drop(*cols_drop)
    print('*** Dropped columns that are all null or have at most ' + str(nonnull_cutoff) + ' non-null values...')
    print(str(cols_drop))
    base.unpersist()
    base2.persist()
    
    print('\n*** Excluded features including a target...')
    print(str(except_for_these))
    
    num = []
    cat = []
    for c in base2.drop(*except_for_these).dtypes:
        if c[1] == 'string':
            cat += [c[0]]
            base2 = base2.withColumn(c[0], F.when((F.col(c[0]) == '') | (F.col(c[0]) == ' ') | (F.col(c[0]).isNull()), F.lit('None')).otherwise(F.col(c[0])))
        if c[1] not in ('string', 'timestamp') and base2.select(F.approx_count_distinct(c[0])).collect()[0][0] > unique_num_value:
            num += [c[0]]
            base2 = ( base2.withColumn(c[0], F.when((F.col(c[0]) == '') | (F.col(c[0]) == ' '), F.lit(None)).otherwise(F.col(c[0])))
                           .withColumn(c[0], F.col(c[0]).cast('double')))
    
    print('\n*** Distribution by target value ***')
    base2.groupBy(target).count().show()
    
    # Create batches for faster processing
    base = (df1.join(df_seg, ['party_id', 'cmdb_partition'], 'left').withColumn('rnum',F.row_number().over(W.orderBy('party_id'))))
    # Replace empty strings to None string or null
    n_batch = 20
    batch_size = int(n_sample/n_batch)
    total_nulls = {}
    dfs = {}
    print('\n*** Replace empty strings to None string or null...***')
    for i in range(n_batch):
        print('*** batch :' + str(i+1) + ' out of ' + str(n_batch) + ' with a size of ' + str(batch_size))
        lb = i* batch_size + 1
        ub = i * batch_size + batch_size
        df_b = base.filter((F.col('rnum') >= lb) & (F.col('rnum') <= ub))
        for c in df_b.drop(*except_for_these).columns:
            if c in cat:
                df_b = df_b.withColumn(c, F.when((F.col(c) == '') | (F.col(c) == ' ') | (F.col(c).isNull()), F.lit('None')).otherwise(F.col(c)))
            if c in num:
                df_b = (df_b.withColumn(c, F.when((F.col(c) == '') | (F.col(c) == ' '), F.lit(None)).otherwise(F.col(c)))
                           .withColumn(c, F.col(c).cast('double')))
            nulls = df_b.select(c).filter((F.col(c).isNull()) | (F.col(c) == 'None')).count()
            total_nulls.setdefault(c, []).append(nulls)
        dfs[str(i)]=df_b
    
    drop_list = []
    for col, nulls in total_nulls.items():
        print('Column : ' + col + ' - null count : ' + str(nulls))
        if sum(nulls) > 0:
            drop_list.append(col)
    
    print('\n*** Drop list ***')
    print(str(drop_list))
    
    print('\n*** Merge batches into one data frame...')
    from functools import reduce
    def unionAll(dfs):
        return reduce(lambda df1, df2: df1.unionByName(df2), dfs)
    base2 = (unionAll(list(dfs.values())).drop(*drop_list))
    print('*** Merged as base2 ...')
    
    #Check imbalance - weighting for the minority class
    total_size = float(base2.count())
    tp_size = float(base2.select(target).filter(F.col(target) == 1).count())
    tn_size = float(total_size - tp_size)
    if tp_size/tn_size < balThreshold:
        print("Imbalance issue exists....target vs non-target ratio : " + str(tp_size/tn_size))
        if use_wgt:
            class_weight = tn_size / total_size
            base2 = base2.withColumn('classWeight', F.when(F.col(target) == 0, class_weight).otherwise(1-class_weight))
        else:
            pass
    else:
        pass
    
    print('\n*** Describe categorical variables in the data set ***')
    print(str(cat))
    for i in cat:    
        base2.groupBy(i).count().orderBy('count', ascending=False).show(2000,truncate=False)
    
    print('*** Describe numeric variables in the data set ***')
    print(str(num))
    for i in num:
        base2.select(i).summary().show(truncate=False)
        if _save_fig:
            tmp = base2.select(i).toPandas()
            tmp.hist(figsize=(12,10))
            plt.savefig('./fig/'+i+'.png')
            plt.close()    
    
    if _undersample:
        print('*** Undersampling major class due to imbalanced data ***')
        base_tp = base2.filter(F.col(target) == 1)
        _count = base_tp.count()
        print('*** Target size :' + str(_count*undersample_rate))
        base_tn = (base2.filter(F.col(target) == 0)
                .withColumn('rand_num', F.lit(F.rand(_seed)))
                .withColumn('row_num', F.row_number().over(W.orderBy('rand_num')))
                .filter(F.col('row_num') <= _count*undersample_rate)
                .drop('rand_num', 'row_num'))
        df = base_tp.unionByName(base_tn)
    else:
        df = base2
        
    base2.unpersist()    
    return df, num, cat
Example #31
def main():

    # Load the csv files directly as dataframes
    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext
    df1 = spark.read.format("csv").option(
        "header", "true").load("simpsons_characters.csv")
    df2 = spark.read.format("csv").option("header",
                                          "true").load("simpsons_episodes.csv")
    df3 = spark.read.format("csv").option(
        "header", "true").load("simpsons_locations.csv")
    df4 = spark.read.format("csv").option(
        "header", "true").load("simpsons_script_lines.csv")

    # A) number of distinct locations appearing in each episode
    dfScriptLines = df4.groupBy('episode_id').agg(
        functions.approx_count_distinct(
            df4.location_id).alias("count")).selectExpr(
                "episode_id as id", "count")
    dfLocations = df2.join(dfScriptLines, on="id",
                           how='outer').select("id", "imdb_rating", "count")
    dfaux1 = dfLocations.withColumn(
        "imdb_rating_double", dfLocations["imdb_rating"].cast(DoubleType()))
    dfaux2 = dfaux1.withColumn("count_double",
                               dfLocations["count"].cast(DoubleType()))
    dfcof = dfaux2.agg(
        functions.corr(dfaux2.imdb_rating_double,
                       dfaux2.count_double).alias('Pearson-Imdb-NumLocations'))
    dfcof.show()

    # B) number of characters appearing in each episode
    dfe = df4.groupBy('episode_id').agg(
        functions.approx_count_distinct(
            df4.character_id).alias("character_count")).selectExpr(
                "episode_id as id", "character_count")
    dfjoin1 = df2.join(dfe, on="id",
                       how='outer').select("id", "imdb_rating",
                                           "character_count")
    dfaux1 = dfjoin1.withColumn("imdb_rating_double",
                                dfLocations["imdb_rating"].cast(DoubleType()))
    dfaux2 = dfaux1.withColumn("character_count_double",
                               dfaux1["character_count"].cast(DoubleType()))
    dfcof = dfaux2.agg(
        functions.corr(
            dfaux2.imdb_rating_double,
            dfaux2.character_count_double).alias('Pearson-Imdb-NumCharacters'))
    dfcof.show()

    # C and D) number of male and female characters appearing in each episode
    dfc = df1.select('id', 'gender').selectExpr("id as character_id", "gender")
    dfe = df4.select('character_id', 'episode_id')
    dfjoin2 = dfc.join(dfe, on="character_id",
                       how='outer').select("episode_id", "character_id",
                                           "gender")
    dfCharacMasc = dfjoin2.filter(
        dfjoin2.gender == 'm').groupBy('episode_id').agg(
            functions.approx_count_distinct(
                dfjoin2.character_id).alias("masc_count")).selectExpr(
                    "episode_id as id", "masc_count")
    dfCharacFem = dfjoin2.filter(
        dfjoin2.gender == 'f').groupBy('episode_id').agg(
            functions.approx_count_distinct(
                dfjoin2.character_id).alias("fem_count")).selectExpr(
                    "episode_id as id", "fem_count")
    dfjoin3 = dfjoin1.join(dfCharacMasc, on="id",
                           how='outer').select("id", "imdb_rating",
                                               "character_count", "masc_count")
    dfjoin4 = dfjoin3.join(dfCharacFem, on="id",
                           how='outer').select("id", "imdb_rating",
                                               "character_count", "masc_count",
                                               "fem_count")
    dfaux1 = dfjoin4.withColumn("imdb_rating_double",
                                dfjoin4["imdb_rating"].cast(DoubleType()))
    dfaux2 = dfaux1.withColumn("masc_count_double",
                               dfaux1["masc_count"].cast(DoubleType()))
    dfcof = dfaux2.agg(
        functions.corr(
            dfaux2.imdb_rating_double,
            dfaux2.masc_count_double).alias('Pearson-Imdb-NumCharactersMasc'))
    dfcof.show()
    dfaux1 = dfjoin4.withColumn("imdb_rating_double",
                                dfjoin4["imdb_rating"].cast(DoubleType()))
    dfaux2 = dfaux1.withColumn("fem_count_double",
                               dfaux1["fem_count"].cast(DoubleType()))
    dfcof = dfaux2.agg(
        functions.corr(
            dfaux2.imdb_rating_double,
            dfaux2.fem_count_double).alias('Pearson-Imdb-NumCharactersFem'))
    dfcof.show()

    # E) number of words
    dfe = df4.groupBy('episode_id').agg(
        functions.sum(df4.word_count).alias("word_count")).selectExpr(
            "episode_id as id", "word_count")
    dfjoin1 = df2.join(dfe, on="id",
                       how='outer').select("id", "imdb_rating", "word_count")
    dfaux1 = dfjoin1.withColumn("imdb_rating_double",
                                dfjoin1["imdb_rating"].cast(DoubleType()))
    dfaux2 = dfaux1.withColumn("word_count_double",
                               dfaux1["word_count"].cast(DoubleType()))
    dfcof = dfaux2.agg(
        functions.corr(
            dfaux2.imdb_rating_double,
            dfaux2.word_count_double).alias('Pearson-Imdb-NumWords'))
    dfcof.show()

    # F) total number of spoken lines
    dfe2 = df4.filter(df4.speaking_line == True).groupBy('episode_id').agg(
        functions.count(df4.raw_text).alias("raw_text_count")).selectExpr(
            "episode_id as id", "raw_text_count")
    dfjoin2 = dfjoin1.join(dfe2, on="id",
                           how='outer').select("id", "imdb_rating",
                                               "word_count", "raw_text_count")
    dfaux1 = dfjoin2.withColumn("imdb_rating_double",
                                dfjoin1["imdb_rating"].cast(DoubleType()))
    dfaux2 = dfaux1.withColumn("raw_text_count_double",
                               dfaux1["raw_text_count"].cast(DoubleType()))
    dfcof = dfaux2.agg(
        functions.corr(
            dfaux2.imdb_rating_double,
            dfaux2.raw_text_count_double).alias('Pearson-Imdb-NumRawText'))
    dfcof.show()

    sc.stop()