def process_unique_time_distribution(valid_rdd, process_properties, data_dict, stats_collector, acc_dict):

    if process_properties.get('unique_time_distribution', None):

        column_number_list = process_properties['unique_time_distribution']['columns']

        for column_number in column_number_list:
            column_name = data_dict[column_number]['name']

            stats_key = 'unique_time_distribution_' + column_name

            frequency = process_properties['unique_time_distribution'].get('frequency', 1)
            frequency_unit = process_properties['unique_time_distribution'].get('unit', 'hours')
            timestamp_format = process_properties['timestamp_format']

            gap_rdd = (valid_rdd
                       .map(lambda x: (x[column_number],
                                       round_datetime(construct_datetime(x, timestamp_format, data_dict),
                                                      frequency, frequency_unit))))

            gap_df = gap_rdd.toDF().selectExpr('_2 as DATETIME', '_1 as COLUMN')

            stats_collector[stats_key] = (gap_df
                                          .groupBy('DATETIME').agg(countDistinct('COLUMN').alias('UNIQUE_COLUMN'))
                                          .rdd
                                          .map(lambda x: [x['DATETIME'], x['UNIQUE_COLUMN']])
                                          .collect())

            print(stats_collector[stats_key])
def get_num_distinct_ctus_per_partyid(imputed_df):
    """
    Calculating number of distinct
    CTUs per customer
    """
    distinct_ctus_per_partyid = imputed_df.groupby('party_id').agg(countDistinct("CTU"))\
        .select("party_id", col("count(CTU)").alias("distinct_ctus"))
    return distinct_ctus_per_partyid
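
# A minimal usage sketch for get_num_distinct_ctus_per_partyid, assuming an active
# SparkSession named `spark`; the sample rows below are made up.
demo_df = spark.createDataFrame(
    [("p1", "ctu_a"), ("p1", "ctu_b"), ("p2", "ctu_a")],
    ["party_id", "CTU"])
get_num_distinct_ctus_per_partyid(demo_df).show()
# expected: party_id p1 -> distinct_ctus 2, p2 -> 1 (row order may vary)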
def agg_fn(df: DataFrame, group_by_cols):
    gbc = [col(x) for x in group_by_cols]

    assoc_df = (df.groupBy(*gbc).agg(
        countDistinct(col("pmid")).alias("f"),
        count(col("pmid")).alias("N")))

    return assoc_df
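
# A minimal usage sketch for agg_fn, assuming an active SparkSession `spark`; the
# "campaign" column and the sample rows are hypothetical, only "pmid" is required
# by the function above. "f" is the distinct pmid count, "N" the non-null pmid count.
events_df = spark.createDataFrame(
    [("c1", "pm1"), ("c1", "pm1"), ("c1", "pm2"), ("c2", "pm3")],
    ["campaign", "pmid"])
agg_fn(events_df, ["campaign"]).show()
# expected: c1 -> f=2, N=3; c2 -> f=1, N=1 (row order may vary)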
def cnt_dates_cnt_ifa(df, OUTPUT_DATA_PATH):
    cnt_dates_cnt_ifa = df.groupBy("c_dates").agg(
        F.countDistinct("ifa").alias('count_ifa'))
    # write to s3 : small file
    cnt_dates_cnt_ifa_path = OUTPUT_DATA_PATH + 'cnt_dates_cnt_ifa.csv'
    cnt_dates_cnt_ifa.repartition(1).write.csv(cnt_dates_cnt_ifa_path,
                                               header=True,
                                               mode='overwrite')
Example #5
def get_average_num_ps1s(df):
  return df.where(
    col('top_level_url').isin(all_top)
  ).groupBy('top_level_url').agg(countDistinct('ps1').alias("count")).select(
    mean('count').alias('mean'),
    expr('percentile_approx(count, 0.5)').alias('median'),
    stddev('count').alias('std')
  ).show()
Example #6
def manufacturer_distribution():

    df = sqlContext.read.load('harddrive.csv',
                              delimiter=';',
                              format='com.databricks.spark.csv',
                              header='true',
                              inferSchema='true')
    dg = df.groupBy("MFG").agg(F.countDistinct("model_new"))
    return dg
Example #7
def process_data(data, begin, date):
    return (data.filter(col('submission_date_s3') > begin)
            .filter(col('submission_date_s3') <= date)
            .groupBy('country')
            .agg((sample_factor *
                  countDistinct('client_id')).alias(active_users_col))
            .select('*',
                    lit(begin).alias(start_date_col),
                    lit(date).alias('submission_date_s3')))
def create_categorical_feature_map(df, categorical_features):
    temp_data = df.agg(*(countDistinct(col(c)).alias(c)
                         for c in df.columns)).collect()[0]
    data_map = {}
    for feature_indx in range(0, len(df.columns)):
        if temp_data.__fields__[feature_indx] in categorical_features:
            data_map[feature_indx] = int(temp_data[feature_indx])
    return data_map
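
# A minimal usage sketch for create_categorical_feature_map, assuming an active
# SparkSession `spark`; the column names and values are hypothetical. The returned
# dict maps the positional index of each categorical column to its distinct count.
demo_df = spark.createDataFrame(
    [("red", 1.0), ("blue", 2.0), ("red", 3.0)],
    ["color", "score"])
print(create_categorical_feature_map(demo_df, categorical_features=["color"]))
# expected: {0: 2}  ("color" is column 0 and has 2 distinct values)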
Example #9
def stats(col: str) -> F.Column:
    return F.struct(
        F.min(col).alias('min'),
        F.max(col).alias('max'),
        F.avg(col).alias('avg'),
        F.count(col).alias('count'),
        F.countDistinct(col).alias('countDistinct'),
    )
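
# A minimal usage sketch for the stats() helper, assuming a DataFrame `df` with a
# numeric column named "price" (hypothetical); the struct can be expanded with ".*".
summary = df.agg(stats("price").alias("price_stats"))
summary.select("price_stats.*").show()  # min, max, avg, count, countDistinct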
def calculate_spending_patterns(orderInfoDF):
    # generate entity summary
    usrAggregates = orderInfoDF.groupby(entity) \
      .agg(F.countDistinct(uID).alias("nCustomers"),
        F.sum(spend).alias("spend"),
        F.count(orderID).alias("orders"))

    return usrAggregates
Example #11
def get_user_demo_metrics(addons_expanded):
    """
    :param addons_expanded: addons_expanded dataframe
    :return: aggregated dataframe by addon_id
        with user demographic information including
        - distribution of users by operating system
        - distribution of users by country
    """
    def source_map(df, extra_filter=""):
        m = F.create_map(
            list(
                chain(*((F.lit(name), F.col(name)) for name in df.columns
                        if name != "addon_id")))).alias(extra_filter)
        return m

    client_counts = (addons_expanded.groupBy("addon_id").agg(
        F.countDistinct("client_id").alias("total_clients")).cache())

    os_dist = (addons_expanded.withColumn("os", bucket_os("os")).groupBy(
        "addon_id",
        "os").agg(F.countDistinct("client_id").alias("os_client_count")).join(
            client_counts, on="addon_id", how="left").withColumn(
                "os_pct",
                F.col("os_client_count") /
                F.col("total_clients")).groupby("addon_id").pivot("os").agg(
                    F.first("os_pct").alias("os_pct")))

    os_dist = os_dist.na.fill(0).select("addon_id",
                                        source_map(os_dist, "os_pct"))

    ct_dist = (addons_expanded.withColumn(
        "country",
        bucket_country("country")).groupBy("addon_id", "country").agg(
            F.countDistinct("client_id").alias("country_client_count")).join(
                client_counts, on="addon_id", how="left").withColumn(
                    "country_pct",
                    F.col("country_client_count") /
                    F.col("total_clients")).groupBy("addon_id").pivot(
                        "country").agg(
                            F.first("country_pct").alias("country_pct")))

    ct_dist = ct_dist.na.fill(0).select("addon_id",
                                        source_map(ct_dist, "country_pct"))

    combined_dist = os_dist.join(ct_dist, on="addon_id", how="outer")
    return combined_dist
Example #12
def calcIdf(corpusDf):
    # get document count
    docCount = corpusDf.agg(f.countDistinct('id'))
    docCount = docCount.collect()[0][0]

    # explode token vector
    corpusDfExplode = (corpusDf.select('id', 'tokens', (f.explode('tokens').alias('indvToken'))))

    # count number of IDs that include each word to get document frequency
    docFreqs = corpusDfExplode.groupBy('indvToken').agg(f.countDistinct('id').alias('df'))
    docFreqs = docFreqs.sort(f.desc('df'))

    # smoothed inverse document frequency
    idfDf = docFreqs.withColumn('idf', f.log((docCount + 1) / (docFreqs.df + 1)))

    idfRdd = idfDf.select('indvToken','idf').rdd 
    return idfRdd 
Example #13
    def test_all(self,
                 measure_columns=None,
                 dimension_columns=None,
                 max_num_levels=40):
        # CommonUtils.create_update_and_save_progress_message(self._dataframe_context,self._scriptWeightDict,self._scriptStages,self._analysisName,"chisquareStats","info",display=False,weightKey="script")
        targetDimension = dimension_columns[0]
        all_dimensions = self._dimension_columns
        all_dimensions = [x for x in all_dimensions if x != targetDimension]
        # if self._analysisDict != {}:
        #     nColsToUse = self._analysisDict[self._analysisName]["noOfColumnsToUse"]
        # else:
        #     nColsToUse = None
        # if nColsToUse != None:
        #     all_dimensions = all_dimensions[:nColsToUse]
        all_measures = self._measure_columns
        df_chisquare_result = DFChiSquareResult()
        # print "df_chisquare_result"*50
        # print df_chisquare_result
        for d in all_dimensions:
            try:
                chisquare_result = self.test_dimension(targetDimension, d)
                df_chisquare_result.add_chisquare_result(
                    targetDimension, d, chisquare_result)
            except Exception as e:
                print(repr(e), d)
                continue
        for m in all_measures:
            try:
                if self._pandas_flag:
                    if len(self._data_frame[m].unique()
                           ) > self._analysisDict['Dimension vs. Dimension'][
                               'binSetting']['binCardinality']:
                        chisquare_result = self.test_measures(
                            targetDimension, m)
                        df_chisquare_result.add_chisquare_result(
                            targetDimension, m, chisquare_result)
                else:
                    if self._data_frame.select(F.countDistinct(m)).collect(
                    )[0][0] > self._analysisDict['Dimension vs. Dimension'][
                            'binSetting']['binCardinality']:
                        chisquare_result = self.test_measures(
                            targetDimension, m)
                        df_chisquare_result.add_chisquare_result(
                            targetDimension, m, chisquare_result)
            except Exception as e:
                print(str(e), m)
                continue

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "completion",
            "info",
            display=False,
            weightKey="script")
        return df_chisquare_result
def calc_retention(heavy_col_name, heavy_co_arr, heavy_co_date, anchor_start,
                   anchor_end):
    """
    Given the heavy user column name, heavy cutoff values, and anchor week dates,
    returns a dataframe (built from the module-level ms_ret dataframe) with the
    retention of the heavy user type for each period.
    
    Params:
    heavy_col_name (string): name of the column for heavy user
    heavy_co_arr (array numbers): an array of cutoff values to define a heavy user for a date
    heavy_co_date (array dates): an array of dates for the cutoff values
    anchor_start (date): date for the start of the anchor week
    anchor_end (date): date for the end of the anchor week
    
    Returns dataframe with retention for heavy user type for each period
    """

    # Get heavy users
    for i in range(0, len(heavy_co_date)):
        if i == 0:
            hu_where = "(submission_date_s3 <= " + heavy_co_date[
                i] + " and " + heavy_col_name + " >= " + str(
                    heavy_co_arr[i]) + ")"
        else:
            hu_where = hu_where + " or (submission_date_s3 = " + heavy_co_date[
                i] + " and " + heavy_col_name + " >= " + str(
                    heavy_co_arr[i]) + ")"
    print(hu_where)

    ms_ret_hu = ms_ret.where(hu_where)
    # Get base week of heavy users
    base_hu = ms_ret_hu \
        .where("submission_date_s3 >= \'" +  anchor_start + "\' and submission_date_s3 <= \'" +  anchor_end + "\'") \
        .select('client_id').distinct()
    # Count unique clients in base week of heavy users
    base_num_clients = base_hu.select('client_id').distinct().count()
    # Get only base week heavy users for the entire time
    ret_hu = ms_ret_hu.join(base_hu, on='client_id')
    # Add period column
    ret_hu = ret_hu.withColumn("anchor", F.lit(base_start)) \
                   .withColumn("period", get_period("anchor", "submission_date_s3"))
    # Group by period and count the number of distinct clients
    hu_weekly_counts = ret_hu.groupby("period").agg(
        F.countDistinct("client_id").alias("n_week_clients"))
    # Add column with calculated retention and 95% CI
    hu_retention = (
        hu_weekly_counts.withColumn(
            "total_clients", F.lit(base_num_clients)).withColumn(
                "retention",
                F.col("n_week_clients") / F.col("total_clients"))
        # Add a 95% confidence interval based on the normal approximation for a binomial distribution,
        # p ± z * sqrt(p*(1-p)/n).
        # The 95% CI spans the range `retention ± ci_95_semi_interval`.
        .withColumn(
            "ci_95_semi_interval",
            F.lit(1.96) * F.sqrt(
                F.col("retention") *
                (F.lit(1) - F.col("retention")) / F.col("total_clients"))))
    return hu_retention
Example #15
def getColumnUniqueCounts(clickDF):
    """
    Parameters
    ---------- 
    clickDF : class Dataframe
    """ 
    distvals = clickDF.agg(*(countDistinct(col(c)).alias(c) for c in clickDF.columns))
    print(type(distvals))
    return distvals
Example #16
def _deduplication_todo(columns, df):
    """
    Returns what (columns, as in spark columns) to compute to get the results requested by
    the parameters.

    :param columns:
    :type columns: list
    :param df:
    :type df: DataFrame
    :return: Pyspark columns representing what to compute.
    """
    if columns is None:
        # 1 count distinct, on all columns
        todo = [countDistinct(*[col(c) for c in df.columns])]
    else:
        # multiple count distinct, one column each
        todo = [countDistinct(col(c)) for c in columns]
    return todo
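
# A minimal usage sketch for _deduplication_todo, assuming a DataFrame `df` with
# (hypothetical) columns "user_id" and "session_id"; the returned columns are meant
# to be evaluated in a single agg() pass.
todo = _deduplication_todo(["user_id", "session_id"], df)
per_column_distinct = df.agg(*todo).collect()[0]          # one distinct count per column
combined_distinct = df.agg(*_deduplication_todo(None, df)).collect()[0][0]  # distinct over all columns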
Example #17
    def __init__(self):
        """
        Call the parent constructor.
        """
        super(FeatureUniqueUaTotal, self).__init__()

        self.group_by_aggs = {
            'distinct_ua': F.countDistinct(F.col('client_ua')).cast('float')
        }
Example #18
def count_unique_request(df_session):
    """
    Determine unique URL visits per session.
    To clarify, count a hit to a unique URL only once per session.
    """
    df_unique_url = df_session.groupby("ip_session_count").agg(
        countDistinct("request_url").alias("count_unique_requests"))
    df_unique_url.show()
    return df_unique_url
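
# A minimal usage sketch for count_unique_request, assuming an active SparkSession
# `spark`; the session id and URLs below are made up.
sessions = spark.createDataFrame(
    [("ip1-s1", "/a"), ("ip1-s1", "/a"), ("ip1-s1", "/b")],
    ["ip_session_count", "request_url"])
count_unique_request(sessions)
# expected: one row with ip_session_count 'ip1-s1' and count_unique_requests 2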
def calculate_concentration_ratios(editor_interactions):
    concentration_ratio = editor_interactions.groupby('page_id').agg(
        f.countDistinct('event_user_id').alias('num_editors'),
        f.sum('revisions_count').alias('num_revisions')) \
        .withColumn('concentration_ratio', col('num_editors') / col('num_revisions'))

    editor_interactions = editor_interactions.join(concentration_ratio,
                                                   on='page_id')
    return editor_interactions
Example #20
def cardinalities(df, cols):

    counts = df.agg(
        F.struct(*[F.countDistinct(F.col(c)).alias(c) for c in cols] +
                 [F.count(F.col(cols[0])).alias('total')]).alias(
                     "results")).select("results").collect()[0][0].asDict()
    counts.update({'total': df.count()})
    return counts
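
# A minimal usage sketch for cardinalities(), assuming a DataFrame `df` with
# (hypothetical) columns "country" and "os"; the result is a plain dict with one
# distinct count per requested column plus the total row count.
print(cardinalities(df, ["country", "os"]))
# e.g. {'country': 42, 'os': 7, 'total': 123456}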
Example #21
def check_df(df):
  print(
    "Number of sites %d" %
    df.agg(countDistinct(col("visit_id"))).collect()[0]
  )
  print(
    "Number of records with stripped cookies %d" %
    df.agg(count_not_null('original_cookies')).collect()[0]
  )
  if 'content_hash' in df.columns:
    print(
      "Total number of scripts loaded %d" %
      df.agg(count_not_null("content_hash")).collect()[0]
    )
    print(
      "Number of distinct scripts loaded %d" %
      df.agg(countDistinct(col("content_hash"))).collect()[0]
    )
Example #22
def get_top_addon_names(addons_expanded):
    w = Window().partitionBy("addon_id").orderBy(F.col("n").desc())
    cnts = addons_expanded.groupby("addon_id", "name").agg(
        F.countDistinct("client_id").alias("n"))
    addon_names = (cnts.withColumn(
        "rn",
        F.row_number().over(w)).where(F.col("rn") == 1).select(
            "addon_id", "name"))
    return addon_names
Example #23
def get_class_distribution(df, col_name='subbreddit_display_name'):
    count = df.groupBy(col_name).agg(F.countDistinct('_id'))
    s = count.select(F.sum('count(DISTINCT _id)')).collect()[0][0]

    udf_func = F.udf(lambda col: round(col / s, 4) * 100, FloatType())

    count = count.withColumn('set_part_%', udf_func('count(DISTINCT _id)'))

    return count.show()
Example #24
def foreach_batch_function(df, epoch_id):
    df.groupBy("country").count().orderBy("count", ascending=False) \
        .select(col("country").alias("maxCountry"), col("count")).limit(1).show()

    df.groupBy("country").count().orderBy("count", ascending=True) \
        .select(col("country").alias("minCountry"), col("count")).limit(1).show()

    df.agg(countDistinct(
        col("first_name")).alias("numberOfUniqueUsers")).show()
def get_addon(data,
              date,
              period=7,
              country_list=None):
    """ Calculate the proportion of WAU that have a "self installed" addon for a specific date

        Parameters:
            data: sample of the main server ping data frame
            date: string, with the format of 'yyyyMMdd'
            period: The number of days before the given date to include in the analysis
            country_list: a list of country names in string

        Returns:
            a dataframe showing all the information for each date in the period
              - three columns: 'submission_date_s3', 'country', 'pct_Addon'
    """

    data_all = keep_countries_and_all(data, country_list)
    begin = date_plus_x_days(date, -period)

    addon_filter = (~F.col('addon.is_system')) & (~F.col('addon.foreign_install')) &\
                   (~F.col('addon.addon_id').isin(NON_MOZ_TP)) &\
                   (~F.col('addon.addon_id').like('%@mozilla%')) &\
                   (~F.col('addon.addon_id').like('%@shield.mozilla%')) &\
                   (~F.col('addon.addon_id').like('%' + UNIFIED_SEARCH_STR + '%'))

    WAU = data_all\
        .filter("submission_date_s3 <= '{0}' and submission_date_s3 > '{1}'".format(date, begin))\
        .groupBy('country')\
        .agg(F.countDistinct('client_id').alias('WAU'))

    addon_count = data_all\
        .filter("submission_date_s3 <= '{0}' and submission_date_s3 > '{1}'".format(date, begin))\
        .select('submission_date_s3', 'country', 'client_id',
                F.explode('active_addons').alias('addon'))\
        .filter(addon_filter)\
        .groupBy('country')\
        .agg(F.countDistinct('client_id').alias('add_on_count'))

    join_df = WAU.join(addon_count, 'country', how='left')\
        .withColumn("pct_addon", (100.0 * F.col("add_on_count") / F.col("WAU")))\
        .select(F.lit(date).alias('submission_date_s3'), '*')

    return join_df.select('submission_date_s3', 'country', 'pct_addon')
Example #26
def main(spark, interaction_path):
    '''
    Parameters
    ----------
    spark : SparkSession object
    interaction_path : string, path to the interaction parquet file to load
    '''
    # Load the dataframe
    interaction = spark.read.parquet(interaction_path)
    #user_map = spark.read.parquet(user_map_path)

    # discard rating == 0
    interaction = interaction.filter(interaction['is_read'] == 1)
    #is_read_filtered = interaction.select("user_id").distinct().count()
    #print("number of distinct user ids after discarding is_read == 0: ", is_read_filtered)

    # Get dataframe with users having more than 30 interactions
    df = interaction.groupby('user_id').agg(countDistinct('book_id').alias('num_interaction'))
    df_more30 = df.where(df.num_interaction > 30)
    user_id_ls = df_more30.select('user_id').distinct()
    #print("number of distinct user ids after discarding interactions < 30: ", user_id_ls.count())

    # save filtered interaction data for possible later use
    print("save filtered data")
    df_more30.write.format("parquet").mode("overwrite").save("hdfs:/user/hj1399/filtered.parquet")

    # split 60%, 20%, 20% of downsample based on distinct user id
    seed=42
    train_user, val_user, test_user = df_more30\
        .select("user_id")\
        .distinct()\
        .randomSplit(weights=[0.6, 0.2, 0.2], seed=seed)

    # join the split train_user, val_user, test_user back with df_more30 to create the datasets
    # these are df now (!= rdd)
    train = train_user.join(df_more30, 'user_id', 'inner')
    val = val_user.join(df_more30, 'user_id', 'inner')
    test = test_user.join(df_more30, 'user_id', 'inner')

    # take 50 % of interactions per user from val and test data and add those to train
    print("start taking half inteactions from val and test data and add them to train")
    val_sampled, val_left = split_data(val, fraction=0.5, seed=42)
    test_sampled, test_left = split_data(test, fraction=0.5, seed=42)

    train_df = train.union(val_sampled).union(test_sampled)
    
    print("number of distinct user ids of train data: ", train_df.select("user_id").distinct().count())
    print("number of distinct user ids of val data: ", val_left.select("user_id").distinct().count())
    print("number of distinct user ids of test data: ", test_left.select("user_id").distinct().count())

    train_df.write.format("parquet").mode("overwrite").save("hdfs:/user/hj1399/train_total.parquet")
    print("train df saved")
    val_left.write.format("parquet").mode("overwrite").save("hdfs:/user/hj1399/val_total.parquet")
    print("val df saved")
    test_left.write.format("parquet").mode("overwrite").save("hdfs:/user/hj1399/test_total.parquet")
    print("test df saved")
Example #27
def install_flow_events(events):
    def source_map(df, alias, extra_filter=""):
        m = F.create_map(
            list(
                chain(*((F.lit(name.split("_")[0]), F.col(name))
                        for name in df.columns
                        if name != "addon_id" and extra_filter in name
                        )))).alias(alias)
        return m

    install_flow_events = (
        events.select([
            "client_id",
            "submission_date_s3",
            "event_map_values.method",
            "event_method",
            "event_string_value",
            "event_map_values.source",
            "event_map_values.download_time",
            "event_map_values.addon_id",
        ]).filter("event_category = 'addonsManager'").filter(
            "event_object = 'extension'").filter("""
        (event_method = 'install' and event_map_values.step = 'download_completed') or 
        (event_method = 'uninstall')
        """).withColumn(
                "addon_id",
                F.when(
                    F.col("addon_id").isNull(),
                    F.col("event_string_value")).otherwise(F.col("addon_id")),
            )  # uninstalls populate addon_id in a different place
        .drop("event_string_value").groupby(
            "addon_id", "event_method", "source").agg(
                F.avg("download_time").alias("avg_download_time"),
                F.countDistinct("client_id").alias("n_distinct_users"),
            ))

    installs = (install_flow_events.filter("event_method = 'install'").groupBy(
        "addon_id").pivot("source").agg(
            (F.sum("n_distinct_users") * F.lit(100)),
            F.avg("avg_download_time")))
    uninstalls = (install_flow_events.filter(
        "event_method = 'uninstall'").groupBy("addon_id").pivot("source").agg(
            (F.sum("n_distinct_users") * F.lit(100))))

    agg = (installs.na.fill(0).select(
        "addon_id",
        source_map(installs, "installs", "n_distinct_users"),
        source_map(installs, "download_times", "avg_download_time"),
    ).join(
        uninstalls.na.fill(0).select("addon_id",
                                     source_map(uninstalls, "uninstalls")),
        on="addon_id",
        how="full",
    ))

    return agg
Example #28
def registry_etl(
    spark: SparkSession,
    config: confuse.core.Configuration,
    icd_to_desc_map: pd.DataFrame,
) -> pd.DataFrame:

    registry_claims_bucket = config["buckets"]["registry_claims"].get(str)
    log.info(f"Registry claim bucket: {registry_claims_bucket}")

    log.info(f"Reading in registry claims data")
    registry_rdd = spark.read.parquet(
        registry_claims_bucket.replace("s3:", "s3a:")).withColumnRenamed(
            "patient_id", "registry_id")

    # Select claims around patient reference date and filter out claims before 2017 to remove ICD9 codes
    registry_rdd = (
        registry_rdd.where(  # Filters to claims falling before reference date
            F.col("claim_date") <
            F.date_sub(F.col("reference_date"), 0)).where(
                F.col("claim_date") >
                F.date_sub(F.col("reference_date"),
                           1 * 365))  # TODO [Low] Move time length into YAML
        .where(F.col("reference_date") >
               F.lit("2017-01-01"))  # TODO [Low] Move cut-off date into YAML
    )
    registry_id_count = registry_rdd.select("registry_id").distinct().count()
    log.info(f"Registry ID Count: {registry_id_count}")

    registry_count_min = config["etl"]["registry_count_min"].get(int)
    log.info(f"ETL of registry data and bringing to pandas")
    registry_df = (registry_rdd.select(
        "registry_id",
        F.explode(F.col("dx_list")).alias("code")).where(
            F.col("code").isNotNull()).groupBy("code").agg(
                F.collect_set(F.col("registry_id")).alias("registry_ids"),
                F.countDistinct(F.col("registry_id")).alias("registry_count"),
            ).where(F.col("registry_count") > registry_count_min).withColumn(
                "registry_rate",
                F.col("registry_count") /
                F.lit(registry_id_count))).toPandas()

    # TODO [Medium] Move below this into Spark
    log.info(f"Registry ETL pandas operations")
    registry_df.drop(
        registry_df.index[~registry_df.code.isin(icd_to_desc_map.code)],
        inplace=True)
    registry_df.sort_values("code").reset_index(drop=True, inplace=True)
    n_total_test = int(
        round(
            registry_df["registry_count"].iloc[0] /
            registry_df["registry_rate"].iloc[0],
            0,
        ))
    log.info(f"N Total Test: {n_total_test}")
    registry_df["n_total_test"] = n_total_test
    return registry_df
Example #29
def edges_agg(context, df: DataFrame):
    cols_gb = context.solid_config.get("cols_gb")
    col_date = context.solid_config.get("col_date")
    min_edges = context.solid_config.get("min_edges")
    node_att = context.solid_config.get("node_att")
    weight_distinct_col = context.solid_config.get("weight_distinct_col")

    context.log.debug(
        f"edges_agg: cols_gb={cols_gb}, col_date={col_date}, min_edges={min_edges}"
    )

    edge_weight_col = "edge_weight"
    filter_query = f"{edge_weight_col}>={min_edges}"

    node_att_x = []
    node_att_y = []

    if node_att:
        node_att_x = [f"{c}1" for c in node_att]
        node_att_y = [f"{c}2" for c in node_att]

    collect_set_list = list(
        set(df.columns) - set(cols_gb) - set(node_att_x + node_att_y))

    agg_list = [
        F.countDistinct(F.col(weight_distinct_col)).alias(edge_weight_col),
    ] + [
        F.collect_set(F.col(c).cast("string")).cast("string").alias(f"{c}_att")
        for c in collect_set_list
    ]
    if col_date:
        agg_list += [
            F.min(F.col(col_date).cast("string")).alias(f"{col_date}_min"),
            F.max(F.col(col_date).cast("string")).alias(f"{col_date}_max"),
        ]

    df_edges = df.groupBy(cols_gb).agg(*agg_list).filter(filter_query)

    node_att_dict = {k: v for k, v in zip(cols_gb, [node_att_x, node_att_y])}
    df_nodes = df.select(*cols_gb + node_att_x + node_att_y)
    df_nodes = create_df_node(df_nodes, node_att_dict)
    df_edges_nodes = create_df_node(df_edges, {k: None for k in cols_gb})
    df_nodes = df_nodes.join(
        df_edges_nodes, on="NodeID",
        how="right")  # keep only nodes that are in df_edges

    df_edges = df_edges.cache()
    df_nodes = df_nodes.cache()

    context.log.info(
        f"df_edges: count={df_edges.count()}, columns={df_edges.columns}")
    context.log.info(
        f"df_nodes: count={df_nodes.count()}, columns={df_nodes.columns}")

    yield Output(df_edges, "df_edges")
    yield Output(df_nodes, "df_nodes")
    def drop_constant_unique_cols(self):
        if not self._pandas_flag:
            cnt = self.data_frame.agg(
                *(F.countDistinct(c).alias(c)
                  for c in self.data_frame.columns)).first()
            self.data_frame = self.data_frame.drop(*[
                c for c in cnt.asDict()
                if cnt[c] == 1 or cnt[c] == self.data_frame.count()
            ])
        else:
            # self.data_frame  = self.data_frame.loc[:, (self.data_frame .nunique() != 1)]
            # new_df  = self.data_frame.loc[:, (self.data_frame.nunique() != self.data_frame .shape[0])]
            # if self.target not in new_df.columns:
            #     new_df[self.target] = self.data_frame[self.target]
            # self.data_frame = new_df
            self.data_frame = self.data_frame.apply(
                lambda col: pd.to_datetime(col, errors='ignore')
                if col.dtypes == object else col,
                axis=0)
            self.data_frame = self.data_frame.loc[:, (
                self.data_frame.nunique() != 1)]
            float_df = self.data_frame.loc[:, (
                self.data_frame.dtypes == 'float64')]
            date_df = self.data_frame[self.datetime_cols]
            int_df = self.data_frame.loc[:,
                                         (self.data_frame.dtypes == 'int64')]
            other_df = self.data_frame.loc[:, (
                (self.data_frame.dtypes == 'object') &
                (self.data_frame.nunique() != self.data_frame.shape[0]))]
            new_df = pd.concat([other_df, float_df, int_df, date_df], axis=1)
            #new_df = new_df.loc[:, (new_df.nunique() != new_df.shape[0])]
            #new_df  = self.data_frame .loc[:, (self.data_frame.nunique() != self.data_frame .shape[0])]
            #new_df=self.data_frame
            if self.target not in new_df.columns:
                new_df[self.target] = self.data_frame[self.target]
                self.data_frame = new_df
            else:
                self.data_frame = new_df
        self.data_change_dict['dropped_columns_list'] = []

        def dropped_col_update(list_element):
            if list_element["column_name"] not in self.data_frame.columns:
                list_element["dropped_column_flag"] = True
                self.data_change_dict['dropped_columns_list'].append(
                    list_element["column_name"])
                self.numeric_cols, self.dimension_cols, self.col_with_nulls = drop_cols_from_list(
                    list_element["column_name"], self.numeric_cols,
                    self.dimension_cols, self.col_with_nulls)
            else:
                pass
            return list_element

        self.data_change_dict['Column_settings'] = [
            dropped_col_update(list_element)
            for list_element in self.data_change_dict['Column_settings']
        ]
Example #31
def os_on_date(data, date, period=7, country_list=None):
    """ Gets the distribution of OS usage calculated on the WAU on 1 day.

        Parameters:
        data: Usually the main summary data frame.
        date: day to get the os distribution for the past week.
        period: The number of days to calculate the distribution. By default it finds os
                distribution over a week.
        country_list: The countries to do the analysis. If None then it does it for the whole
                      world.

        Returns:
            submission_date_s3, country, os, pct_on_os
       """

    data_all = keep_countries_and_all(data, country_list)
    begin = date_plus_x_days(date, -period)
    data_all = data_all.select(
        'client_id', 'submission_date_s3', 'country',
        nice_os(col('os'), col('os_version')).alias('nice_os'))

    # Calculate the WAU
    wau = data_all\
        .filter((col('submission_date_s3') <= date) & (col('submission_date_s3') > begin))\
        .groupBy('country')\
        .agg(countDistinct('client_id').alias('WAU'))

    os_wau = data_all\
        .filter((col('submission_date_s3') <= date) &
                (col('submission_date_s3') > begin))\
        .groupBy('country', 'nice_os')\
        .agg(countDistinct('client_id').alias('WAU_on_OS'))\
        .select(lit(begin).alias('start_date'), lit(date).alias('submission_date_s3'),
                'country', 'WAU_on_OS', 'nice_os')

    res = os_wau.join(wau, 'country', how='left')\
                .select('start_date', 'submission_date_s3',
                        'country', 'WAU_on_OS', 'nice_os', 'WAU')

    return res.select('submission_date_s3', 'country',
                      col('nice_os').alias('os'),
                      (100.0 * col('WAU_on_OS') /
                       col('WAU')).alias('pct_on_os'))
Example #32
def main(sc):
    cdetEnclosureData = sc.textFile("file:///scratch/CDETS_Enclosures.txt")
    cdetEnclosureDataClean = cdetEnclosureData.map(clean_row)
    dataDict = cdetEnclosureDataClean.map(make_row)
    cdetEnclosureDataCleanDF = sqlContext.createDataFrame(dataDict)
    cdetEnclosureDataCleanDF.groupBy("CdetID").count().show()
    cdetEnclosureDataCleanDF.agg(countDistinct("CdetID")).show()
    
    #Call the dumpcr command
    cdetEnclosureData.foreach(get_enclosure_content)
Example #33
    def test_aggregator(self):
        df = self.df
        g = df.groupBy()
        self.assertEqual([99, 100], sorted(g.agg({'key': 'max', 'value': 'count'}).collect()[0]))
        self.assertEqual([Row(**{"AVG(key#0)": 49.5})], g.mean().collect())

        from pyspark.sql import functions
        self.assertEqual((0, u'99'),
                         tuple(g.agg(functions.first(df.key), functions.last(df.value)).first()))
        self.assertTrue(95 < g.agg(functions.approxCountDistinct(df.key)).first()[0])
        self.assertEqual(100, g.agg(functions.countDistinct(df.value)).first()[0])
def process_unique_columns(raw_df, process_properties, data_dict, stats_collector, acc_dict):

    if process_properties.get('unique_columns', None):

        for index in process_properties['unique_columns']:
            unique_count = (raw_df
                            .groupBy()
                            .agg(countDistinct(data_dict[index]['name']).alias('DISTINCT_COUNT'))
                            .collect())

            stats_collector['unique_' + data_dict[index]['name']] = unique_count[0]['DISTINCT_COUNT']

            print(stats_collector['unique_' + data_dict[index]['name']])
Example #35
def runAggregateFunctions(spark, df1, df2):
    # collect_list, collect_set
    doubledDf1 = df1.union(df1)
    doubledDf1.select(functions.collect_list(doubledDf1["name"])).show(truncate=False)
    doubledDf1.select(functions.collect_set(doubledDf1["name"])).show(truncate=False)

    # count, countDistinct
    doubledDf1.select(functions.count(doubledDf1["name"]), functions.countDistinct(doubledDf1["name"])).show(
        truncate=False)

    # sum
    df2.printSchema()
    df2.select(functions.sum(df2["price"])).show(truncate=False)

    # grouping, grouping_id
    df2.cube(df2["store"], df2["product"]).agg(sum(df2["amount"]), grouping(df2["store"])).show(truncate=False)
    df2.cube(df2["store"], df2["product"]).agg(sum(df2["amount"]), grouping_id(df2["store"], df2["product"])).show(
        truncate=False)
def process_groupby_unique_columns(raw_df, process_properties, data_dict, stats_collector, acc_dict):
    if process_properties.get('groupby_unique_columns', None):

        for item in process_properties['groupby_unique_columns']:
            groupby_column_name = data_dict[item[0]]['name']
            count_unique_column_name = data_dict[item[1]]['name']

            unique_count = (raw_df
                            .groupBy(groupby_column_name)
                            .agg(countDistinct(count_unique_column_name).alias('DISTINCT_COUNT'))
                            .collect())

            key_name = 'groupby_' + groupby_column_name + '_unique_' + count_unique_column_name
            stats_collector[key_name] = dict()

            for row in unique_count:
                if row[groupby_column_name] not in stats_collector[key_name]:
                    stats_collector[key_name][row[groupby_column_name]] = row['DISTINCT_COUNT']

            print(stats_collector[key_name])
    def describe_1d(df, column, nrows, lookup_config=None):
        column_type = df.select(column).dtypes[0][1]
        # TODO: think about implementing analysis for complex
        # data types:
        if ("array" in column_type) or ("stuct" in column_type) or ("map" in column_type):
            raise NotImplementedError("Column {c} is of type {t} and cannot be analyzed".format(c=column, t=column_type))

        distinct_count = df.select(column).agg(countDistinct(col(column)).alias("distinct_count")).toPandas()
        non_nan_count = df.select(column).na.drop().select(count(col(column)).alias("count")).toPandas()
        results_data = pd.concat([distinct_count, non_nan_count],axis=1)
        results_data["p_unique"] = results_data["distinct_count"] / float(results_data["count"])
        results_data["is_unique"] = results_data["distinct_count"] == nrows
        results_data["n_missing"] = nrows - results_data["count"]
        results_data["p_missing"] = results_data["n_missing"] / float(nrows)
        results_data["p_infinite"] = 0
        results_data["n_infinite"] = 0
        result = results_data.iloc[0].copy()
        result["memorysize"] = 0
        result.name = column

        if result["distinct_count"] <= 1:
            result = result.append(describe_constant_1d(df, column))
        elif column_type in {"tinyint", "smallint", "int", "bigint"}:
            result = result.append(describe_integer_1d(df, column, result, nrows))
        elif column_type in {"float", "double", "decimal"}:
            result = result.append(describe_float_1d(df, column, result, nrows))
        elif column_type in {"date", "timestamp"}:
            result = result.append(describe_date_1d(df, column))
        elif result["is_unique"] == True:
            result = result.append(describe_unique_1d(df, column))
        else:
            result = result.append(describe_categorical_1d(df, column))
            # Fix to also count the MISSING value in the distinct_count field:
            if result["n_missing"] > 0:
                result["distinct_count"] = result["distinct_count"] + 1

        # TODO: check whether it is worth it to
        # implement the "real" mode:
        if (result["count"] > result["distinct_count"] > 1):
            try:
                result["mode"] = result["top"]
            except KeyError:
                result["mode"] = 0
        else:
            try:
                result["mode"] = result["value_counts"].index[0]
            except KeyError:
                result["mode"] = 0
            # If an IndexError happens,
            # it is because all values in the column are NULL:
            except IndexError:
                result["mode"] = "MISSING"

        if lookup_config:
            lookup_object = lookup_config['object']
            col_name_in_db = lookup_config['col_name_in_db'] if 'col_name_in_db' in lookup_config else None
            try:
                matched, unmatched = lookup_object.lookup(df.select(column), col_name_in_db)
                result['lookedup_values'] = str(matched.count()) + "/" + str(df.select(column).count())
            except Exception:
                result['lookedup_values'] = 'FAILED'
        else:
            result['lookedup_values'] = ''

        return result
def main(input_path, output_path):

    sc = SparkContext(appName='Data_Analysis')

    '''
    Define spark accumulators for counting total records,
    valid records and empty string checking
    '''
    empty_records = {}  # accumulator check the emptiness of columns
    for column in CHECK_EMPTY_COLUMNS:
        empty_records[column] = sc.accumulator(0)

    total_records = sc.accumulator(0)
    valid_records = sc.accumulator(0)

    # collect all desired statistics
    stats_collector = {}

    # load raw dataset
    raw_rdd = sc.textFile(input_path).map(lambda x: x.split('|'))
    # validate whether the data fulfills the definition of the data dictionary
    validate_rdd = raw_rdd.filter(lambda x: data_validate(x, VALIDATION_LIST, total_records, valid_records))

    '''
    Keep only the following columns: 'IMSI', 'EVENT_TYPE', 'CGI', 'DATETIME', 'BBC'
    '''
    # load fixed cell master file for cgi and bbc mapping
    cell_master_dict = {}
    with open(CELL_MASTER_FILE, 'r') as f:
        for line in f:
            line = line.strip()
            line = line.split('|')
            cell_master_dict[line[0]] = line[9]

    transform_rdd = validate_rdd.map(lambda x: data_tranform(x, cell_master_dict, empty_records))

    '''
    Filter out records for JABODETABEK
    '''
    Jabodetabek_rdd = transform_rdd.filter(lambda x: x[-1] == 'JABODETABEK').cache()
    stats_collector['Jabodetabek_records'] = Jabodetabek_rdd.count()
    print('Number of Jabodetabek records: %d' % stats_collector['Jabodetabek_records'])

    stats_collector['total_records'] = total_records.value
    print('Number of total records: %d' % stats_collector['total_records'])

    stats_collector['valid_records'] = valid_records.value
    print('Number of valid records: %d' % stats_collector['valid_records'])

    stats_collector['empty_records'] = {}
    if stats_collector['valid_records'] != 0:
        for column in CHECK_EMPTY_COLUMNS:
            stats_collector['empty_records'][column] = empty_records[column].value
            print('Empty records in column %s: %d, with percentage %.2f' % (column, empty_records[column].value,
                                                                            empty_records[column].value / float(
                                                                                stats_collector['valid_records'])))

    if stats_collector['Jabodetabek_records'] > 0:

        '''
        Generate the aggregate count distribution over time
        '''
        # aggregate count distribution over time per event type
        event_datetime_pair = Jabodetabek_rdd.map(lambda x: ((x[1], round_datetime(x[3], 10, 'minutes')), 1))
        agg_event_datetime_pair = event_datetime_pair.reduceByKey(lambda x, y: x + y).sortByKey(ascending=True)
        stats_collector['event_time_distribution'] = {}
        for item in agg_event_datetime_pair.collect():
            if item[0][0] not in stats_collector['event_time_distribution']:
                stats_collector['event_time_distribution'][item[0][0]] = [[item[0][1], item[1]]]
            else:
                stats_collector['event_time_distribution'][item[0][0]].append([item[0][1], item[1]])
        print('Event time records distribution: %s' % str(stats_collector['event_time_distribution']))

        # overall aggregate count distribution over time
        event_time_distribution = stats_collector['event_time_distribution']
        event_list = event_time_distribution.keys()

        time_series_dict = dict()
        for event_type in event_list:
            for item in event_time_distribution[event_type]:
                if item[0] not in time_series_dict:
                    time_series_dict[item[0]] = item[1]
                else:
                    time_series_dict[item[0]] += item[1]

        time_distribution = [[t, time_series_dict[t]] for t in time_series_dict]
        time_distribution = sorted(time_distribution, key=lambda x: x[0])

        stats_collector['time_distribution'] = time_distribution
        print('Time records distribution: %s' % str(stats_collector['time_distribution']))

        # Group by records according different event
        stats_collector['event_distribution'] = [[e_type, sum([event_time_distribution[e_type][i][1] for i in
                                                               range(len(event_time_distribution[e_type]))])]
                                                 for e_type in event_list]
        print('Event distribution is: %s' % stats_collector['event_distribution'])


        '''
        Convert RDD to dataframe
        '''
        sql_context = SQLContext(sc)
        fields = [StructField(field_name, StringType(), True) for field_name in SCHEMA_NAMES]
        schema = StructType(fields)
        df = sql_context.createDataFrame(Jabodetabek_rdd, schema).cache()

        '''
        Unique IMSI per hour
        '''
        udf_round_datetime = udf(round_datetime, StringType())
        df = df.select(df['*'], udf_round_datetime(df['DATETIME']).alias('Hour'))
        stats_collector['imsi_per_hour'] = (df.groupBy('Hour').agg(countDistinct('IMSI').alias('UNIQUE_IMSI'))
                                            .rdd.map(lambda x: [x['Hour'], x['UNIQUE_IMSI']]).collect())

        '''
        Number of records per imsi distribution
        '''
        count_by_imsi = df.groupBy('IMSI').count().selectExpr('count as num_records') # ['IMSI', 'num_records']
        stats_collector['total_imsi'] = count_by_imsi.count()
        print('Total number of imsi is %d' % stats_collector['total_imsi'])

        stats = count_by_imsi.selectExpr('avg(num_records) as mean', 'stddev(num_records) as std').collect()
        mean_value = stats[0]['mean']
        std_value = stats[0]['std']
        upper_limit = mean_value + 3 * std_value # mean + 3 * standard deviation

        count_by_imsi_filtered = count_by_imsi.filter(count_by_imsi['num_records'] <= upper_limit)
        agg_by_count = count_by_imsi_filtered.groupBy('num_records').count()

        stats_collector['imsi_distribution'] = agg_by_count.rdd.map(lambda x: [x['num_records'], x['count']]).collect()
        print('IMSI distribution is: %s' % stats_collector['imsi_distribution'])

        output_rdd = sc.parallelize([json.dumps(stats_collector)])
        output_rdd.saveAsTextFile(output_path)
    else:
        output_rdd = sc.parallelize([json.dumps(stats_collector)])
        output_rdd.saveAsTextFile(output_path)
        raise ValueError('No record for Jabodetabek Found.')
Example #39
# COMMAND ----------

filterNonNullDF = nonNullDF.filter((col("firstName") == "") | (col("lastName") == "")).sort(asc("email"))
display(filterNonNullDF)

# COMMAND ----------

# MAGIC %md
# MAGIC **Example aggregations using ``agg()`` and ``countDistinct()``.**

# COMMAND ----------

from pyspark.sql.functions import countDistinct


countDistinctDF = nonNullDF.select("firstName", "lastName").groupBy("firstName", "lastName").agg(countDistinct("firstName")) 
  
display(countDistinctDF)

# COMMAND ----------

# MAGIC %md 
# MAGIC **Compare the DataFrame and SQL Query Physical Plans**
# MAGIC **(Hint:  They should be the same.)**

# COMMAND ----------

countDistinctDF.explain()
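
# COMMAND ----------

# A hedged sketch of the SQL side of the comparison (assumes the Databricks `spark`
# session): register the DataFrame as a temp view (the view name is arbitrary) and
# explain the equivalent query; the physical plan should match countDistinctDF.explain() above.
nonNullDF.createOrReplaceTempView("databricks_df_example")
spark.sql("""
  SELECT firstName, lastName, count(DISTINCT firstName)
  FROM databricks_df_example
  GROUP BY firstName, lastName
""").explain()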

# COMMAND ----------
def main_process(raw_rdd, sc, output_path, data_dict, process_properties):

    sql_context = SQLContext(sc)

    stats_collector = dict()
    stats_collector['data_dict'] = data_dict
    stats_collector['process_properties'] = process_properties

    # initialize accumulator
    acc_dict = initial_accumulator_dict(sc, data_dict)

    # data validation process
    valid_rdd = raw_rdd.filter(lambda x: data_validation(x, data_dict, acc_dict))

    if process_properties.get('time_distribution', None):
        if 'timestamp_format' not in process_properties:
            raise Exception(
                'Time distribution cannot be derived without timestamp information specified in properties.')

        frequency = process_properties['time_distribution'].get('frequency', 10)
        frequency_unit = process_properties['time_distribution'].get('unit', 'minutes')

        event_datetime_pair = valid_rdd.map(lambda x: datetime_distribution_pair(x, process_properties,
                                                                                 data_dict, frequency, frequency_unit))

        agg_event_datetime_result = (event_datetime_pair.reduceByKey(lambda x, y: x + y)
                                     .sortByKey(ascending=True)
                                     .collect())
        process_agg_event_datetime_result(agg_event_datetime_result, process_properties, stats_collector, data_dict)

        store_accumulator_dict(acc_dict, stats_collector)

    if process_properties.get('time_gap_distribution', None):
        if 'timestamp_format' not in process_properties:
            raise Exception(
                'Time distribution cannot be derived without timestamp information specified in properties.')

        column_number = process_properties['time_gap_distribution']['index']
        return_range = process_properties['time_gap_distribution'].get('return_std_range', 3)
        stats_method = process_properties['time_gap_distribution'].get('stats_method', 'median')
        resolution = process_properties['time_gap_distribution'].get('resolution', 60)
        threshold = process_properties['time_gap_distribution'].get('threshold', 3600*24)

        column_name = data_dict[column_number]['name']
        stats_key = 'gap_distribution_' + column_name
        stats_collector[stats_key] = {}

        agg_result = (valid_rdd.map(lambda x: (x[column_number],
                                               construct_datetime(x, process_properties['timestamp_format'],
                                                                  data_dict)))
                      .groupByKey()
                      .map(lambda x: (x[0], generate_gap_stats(x[1], stats_method, resolution, threshold)))
                      .filter(lambda x: (x[1] is not None) and (x[1] != 0))
                      .map(lambda x: (x[1], 1))
                      .reduceByKey(lambda x, y: x + y)
                      .collect())

        mean_value = hist_mean(agg_result)
        std_value = hist_std(agg_result)
        median_value = hist_median(agg_result)

        stats_collector[stats_key]['mean'] = mean_value
        stats_collector[stats_key]['std'] = std_value
        stats_collector[stats_key]['median'] = median_value

        upper_limit = mean_value + return_range * std_value  # mean + return_range * standard deviation
        stats_collector[stats_key]['distribution'] = [[item[0], item[1]] for item in agg_result if
                                                      item[0] <= upper_limit]

        store_accumulator_dict(acc_dict, stats_collector)

    # Test
    if process_properties.get('hourly_time_gap_distribution', None):
        if 'timestamp_format' not in process_properties:
            raise Exception(
                'Time distribution cannot be derived without timestamp information specified in properties.')

        def hour_gap_pair_list(spark_iterable, resolution, threshold):

            assert isinstance(resolution,
                              int) and resolution > 0, 'Please provide proper resolution in terms of seconds.'

            if len(spark_iterable) > 1:

                dt_list = [datetime.datetime.strptime(dt, '%Y-%m-%d %H:%M:%S') for dt in spark_iterable]
                dt_list = sorted(dt_list)

                gap_list = []
                for i in range(1, len(dt_list)):
                    gap = int((dt_list[i] - dt_list[i - 1]).total_seconds())
                    if gap < threshold:
                        hour = round_datetime(datetime.datetime.strftime(dt_list[i - 1], '%Y-%m-%d %H:%M:%S'),
                                              sample_f=1, unit='hours')
                        gap_list.append(((hour, gap // resolution), 1))

                if len(gap_list) >= 1:
                    return gap_list
                else:
                    return [((0, 0), 1)]

            else:
                return [((0, 0), 1)]

        column_number = process_properties['hourly_time_gap_distribution']['index']
        return_range = process_properties['hourly_time_gap_distribution'].get('return_std_range', 3)
        resolution = process_properties['hourly_time_gap_distribution'].get('resolution', 60)
        threshold = process_properties['hourly_time_gap_distribution'].get('threshold', 3600)

        column_name = data_dict[column_number]['name']
        stats_key = 'hourly_gap_distribution_' + column_name
        stats_collector[stats_key] = dict()

        agg_result = (valid_rdd.map(lambda x: (x[column_number],
                                               construct_datetime(x, process_properties['timestamp_format'],
                                                                  data_dict)))
                      .groupByKey()
                      .flatMap(lambda x: hour_gap_pair_list(x[1], resolution, threshold))
                      .filter(lambda x: (x[0][1] is not None) and (x[0][1] != 0))
                      .reduceByKey(lambda x, y: x + y)
                      .collect())

        hourly_agg_result = dict()

        for item in agg_result:
            if item[0][0] not in hourly_agg_result:
                hourly_agg_result[item[0][0]] = {item[0][1]: item[1]}
            else:
                if item[0][1] not in hourly_agg_result[item[0][0]]:
                    hourly_agg_result[item[0][0]][item[0][1]] = item[1]
                else:
                    hourly_agg_result[item[0][0]][item[0][1]] += item[1]

        for hour in hourly_agg_result:
            stats_collector[stats_key][hour] = dict()
            agg_result = [[item[0], item[1]] for item in hourly_agg_result[hour].items()]

            mean_value = hist_mean(agg_result)
            std_value = hist_std(agg_result)
            median_value = hist_median(agg_result)

            stats_collector[stats_key][hour]['mean'] = mean_value
            stats_collector[stats_key][hour]['std'] = std_value
            stats_collector[stats_key][hour]['median'] = median_value

            upper_limit = mean_value + return_range * std_value  # mean + return_range * standard deviation
            stats_collector[stats_key][hour]['distribution'] = [[item[0], item[1]] for item in agg_result if
                                                                item[0] <= upper_limit]

    if process_properties.get('unique_time_distribution', None):
        column_number_list = process_properties['unique_time_distribution']['columns']

        for column_number in column_number_list:
            column_name = data_dict[column_number]['name']

            stats_key = 'unique_time_distribution_' + column_name

            frequency = process_properties['unique_time_distribution'].get('frequency', 1)
            frequency_unit = process_properties['unique_time_distribution'].get('unit', 'hours')

            gap_rdd = (valid_rdd.map(lambda x: (x[column_number],
                                                round_datetime(
                                                    construct_datetime(x, process_properties['timestamp_format'],
                                                                       data_dict)
                                                    , frequency, frequency_unit))))

            gap_df = gap_rdd.toDF().selectExpr('_2 as DATETIME', '_1 as COLUMN')

            stats_collector[stats_key] = (gap_df.groupBy('DATETIME').agg(countDistinct('COLUMN').alias('UNIQUE_COLUMN'))
                                          .rdd.map(lambda x: [x['DATETIME'], x['UNIQUE_COLUMN']]).collect())

        store_accumulator_dict(acc_dict, stats_collector)

    if process_properties.get('unique_columns', None):
        if 'raw_df' not in locals():
            raw_df = generate_raw_dataframe(valid_rdd, sql_context, data_dict).cache()

        for index in process_properties['unique_columns']:
            unique_count = (raw_df.groupBy()
                            .agg(countDistinct(data_dict[index]['name'])
                                 .alias('DISTINCT_COUNT'))
                            .collect())
            stats_collector['unique_' + data_dict[index]['name']] = unique_count[0]['DISTINCT_COUNT']

        store_accumulator_dict(acc_dict, stats_collector)

    if process_properties.get('groupby_unique_columns', None):
        if 'raw_df' not in locals():
            raw_df = generate_raw_dataframe(valid_rdd, sql_context, data_dict).cache()

        for item in process_properties['groupby_unique_columns']:
            groupby_column = item[0]
            groupby_column_name = data_dict[item[0]]['name']
            count_unique_column = item[1]
            count_unique_column_name = data_dict[item[1]]['name']

            unique_count = (raw_df
                            .groupBy(groupby_column_name)
                            .agg(countDistinct(count_unique_column_name).alias('DISTINCT_COUNT'))
                            .collect())

            key_name = 'groupby_' + groupby_column_name + '_unique_' + count_unique_column_name
            stats_collector[key_name] = dict()

            for row in unique_count:
                if row[groupby_column_name] not in stats_collector[key_name]:
                    stats_collector[key_name][row[groupby_column_name]] = row['DISTINCT_COUNT']

        store_accumulator_dict(acc_dict, stats_collector)

    if process_properties.get('groupby_columns', None):
        if 'raw_df' not in locals():
            raw_df = generate_raw_dataframe(valid_rdd, sql_context, data_dict).cache()

        groupby_column_index = process_properties['groupby_columns']
        groupby_column_names = [data_dict[index]['name']
                                for index in groupby_column_index]

        groupby_stats = raw_df.groupBy(groupby_column_names).count().collect()

        for column in groupby_column_names:
            stats_collector['groupby_' + column] = dict()

        for row in groupby_stats:
            for column in groupby_column_names:
                stats_collector['groupby_' + column][row[column]] = \
                    stats_collector['groupby_' + column].get(row[column], 0) + row['count']

        store_accumulator_dict(acc_dict, stats_collector)

    if process_properties.get('transaction_distribution', None):
        if 'raw_df' not in locals():
            raw_df = generate_raw_dataframe(valid_rdd, sql_context, data_dict).cache()

        column_number = process_properties['transaction_distribution']['index']
        column_name = data_dict[column_number]['name']
        return_range = process_properties['transaction_distribution']['return_std_range']
        stats_key = 'records_distribution_' + column_name
        stats_collector[stats_key] = {}

        agg_by_count = (raw_df
                        .groupBy(column_name).count()
                        .selectExpr('count as num_records')
                        .groupBy('num_records')
                        .count())
        agg_result = agg_by_count.rdd.map(lambda x: [x['num_records'], x['count']]).collect()

        mean_value = hist_mean(agg_result)
        std_value = hist_std(agg_result)
        median_value = hist_median(agg_result)

        stats_collector[stats_key]['mean'] = mean_value
        stats_collector[stats_key]['std'] = std_value
        stats_collector[stats_key]['median'] = median_value

        upper_limit = mean_value + return_range * std_value  # mean + return_range * standard deviation
        stats_collector[stats_key]['distribution'] = [[item[0], item[1]] for item in agg_result if
                                                      item[0] <= upper_limit]

        store_accumulator_dict(acc_dict, stats_collector)

    output_rdd = sc.parallelize([json.dumps(stats_collector)])
    output_rdd.saveAsTextFile(output_path)
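

# The hist_mean / hist_std / hist_median helpers used above are not defined in
# this file. A minimal sketch of what they might look like, assuming each
# agg_result entry is a [value, count] histogram pair:
import math

def hist_mean(hist):
    # weighted mean of the histogram values
    total = sum(c for _, c in hist)
    return sum(v * c for v, c in hist) / float(total) if total else 0.0

def hist_std(hist):
    # weighted standard deviation around the weighted mean
    total = sum(c for _, c in hist)
    if not total:
        return 0.0
    m = hist_mean(hist)
    return math.sqrt(sum(c * (v - m) ** 2 for v, c in hist) / float(total))

def hist_median(hist):
    # smallest value whose cumulative count reaches half of the total
    total = sum(c for _, c in hist)
    running = 0
    for v, c in sorted(hist):
        running += c
        if running * 2 >= total:
            return v

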
def main_process(raw_rdd, sc, output_path, data_dict, process_properties):

    # ------------------------------------------------------------------------------------------------------------------
    # ------------------------Initialization----------------------------------------------------------------------------
    # initialize sql context
    sql_context = SQLContext(sc)

    stats_collector = dict()
    stats_collector['data_dict'] = data_dict
    stats_collector['process_properties'] = process_properties

    # initialize accumulator
    acc_dict = initial_accumulator_dict(sc, data_dict)

    # data validation process
    valid_rdd = raw_rdd.filter(lambda x: data_validation(x, data_dict, acc_dict))

    # ------------------------------------------------------------------------------------------------------------------
    # ------------------------time distribution of number of records----------------------------------------------------
    if process_properties.get('time_distribution', None):
        if 'timestamp_format' not in process_properties:
            raise Exception(
                'Time distribution cannot be derived without timestamp information specified in properties.')

        frequency = process_properties['time_distribution'].get('frequency', 10)
        frequency_unit = process_properties['time_distribution'].get('unit', 'minutes')

        event_datetime_pair = valid_rdd.map(lambda x: datetime_distribution_pair(x, process_properties,
                                                                                 data_dict, frequency, frequency_unit))

        agg_event_datetime_result = (event_datetime_pair.reduceByKey(lambda x, y: x + y)
                                     .sortByKey(ascending=True)
                                     .collect())
        process_agg_event_datetime_result(agg_event_datetime_result, process_properties, stats_collector, data_dict)

        store_accumulator_dict(acc_dict, stats_collector)

    # ------------------------------------------------------------------------------------------------------------------
    # ------------------------distribution of median (mean) of time interval per imsi ----------------------------------
    if process_properties.get('time_gap_distribution', None):
        if 'timestamp_format' not in process_properties:
            raise Exception(
                'Time gap distribution cannot be derived without timestamp information specified in properties.')

        column_number = process_properties['time_gap_distribution']['index']
        return_range = process_properties['time_gap_distribution'].get('return_std_range', 3)
        stats_method = process_properties['time_gap_distribution'].get('stats_method', 'median')
        resolution = process_properties['time_gap_distribution'].get('resolution', 60)
        threshold = process_properties['time_gap_distribution'].get('threshold', 3600*24)

        column_name = data_dict[column_number]['name']
        stats_key = 'gap_distribution_' + column_name
        stats_collector[stats_key] = {}

        # pair each key with its parsed event time, compute a single gap statistic
        # per key, then histogram those statistics as (value, count) pairs
        agg_result = (valid_rdd.map(lambda x: (x[column_number],
                                               construct_datetime(x, process_properties['timestamp_format'],
                                                                  data_dict)))
                      .groupByKey()
                      .map(lambda x: (x[0], generate_gap_stats(x[1], stats_method, resolution, threshold)))
                      .filter(lambda x: (x[1] != 0) and (x[1] is not None))
                      .map(lambda x: (x[1], 1))
                      .reduceByKey(lambda x, y: x + y)
                      .collect())

        mean_value = hist_mean(agg_result)
        std_value = hist_std(agg_result)
        median_value = hist_median(agg_result)

        stats_collector[stats_key]['mean'] = mean_value
        stats_collector[stats_key]['std'] = std_value
        stats_collector[stats_key]['median'] = median_value

        upper_limit = mean_value + return_range * std_value  # mean + return_range * standard deviation
        stats_collector[stats_key]['distribution'] = [[item[0], item[1]] for item in agg_result if
                                                      item[0] <= upper_limit]

        store_accumulator_dict(acc_dict, stats_collector)

    # ------------------------------------------------------------------------------------------------------------------
    # ------------------------hourly time interval distribution --------------------------------------------------------
    if process_properties.get('hourly_time_gap_distribution', None):
        if 'timestamp_format' not in process_properties:
            raise Exception(
                'Hourly time gap distribution cannot be derived without timestamp information specified in properties.')

        column_number = process_properties['hourly_time_gap_distribution']['index']
        return_range = process_properties['hourly_time_gap_distribution'].get('return_std_range', 3)
        resolution = process_properties['hourly_time_gap_distribution'].get('resolution', 60)
        threshold = process_properties['hourly_time_gap_distribution'].get('threshold', 3600)

        column_name = data_dict[column_number]['name']
        stats_key = 'hourly_gap_distribution_' + column_name
        stats_collector[stats_key] = dict()

        # per key, emit ((hour, gap), 1) pairs via hour_gap_pair_list, then count
        # how often each gap value occurs within each hour
        agg_result = (valid_rdd.map(lambda x: (x[column_number],
                                               construct_datetime(x, process_properties['timestamp_format'],
                                                                  data_dict)))
                      .groupByKey()
                      .flatMap(lambda x: hour_gap_pair_list(x[1], resolution, threshold))
                      .filter(lambda x: (x[0][1] is not None) and (x[0][1] != 0))
                      .reduceByKey(lambda x, y: x + y)
                      .collect())

        hourly_agg_result = dict()

        for item in agg_result:
            if item[0][0] not in hourly_agg_result:
                hourly_agg_result[item[0][0]] = {item[0][1]: item[1]}
            else:
                if item[0][1] not in hourly_agg_result[item[0][0]]:
                    hourly_agg_result[item[0][0]][item[0][1]] = item[1]
                else:
                    hourly_agg_result[item[0][0]][item[0][1]] += item[1]

        for hour in hourly_agg_result:
            stats_collector[stats_key][hour] = dict()
            agg_result = [[item[0], item[1]] for item in hourly_agg_result[hour].items()]

            mean_value = hist_mean(agg_result)
            std_value = hist_std(agg_result)
            median_value = hist_median(agg_result)

            stats_collector[stats_key][hour]['mean'] = mean_value
            stats_collector[stats_key][hour]['std'] = std_value
            stats_collector[stats_key][hour]['median'] = median_value

            upper_limit = mean_value + return_range * std_value  # mean + return_range * standard deviation
            stats_collector[stats_key][hour]['distribution'] = [[item[0], item[1]] for item in agg_result if
                                                                item[0] <= upper_limit]

    # ------------------------------------------------------------------------------------------------------------------
    # ------------------------time distribution of unique records ------------------------------------------------------
    if process_properties.get('unique_time_distribution', None):
        if 'timestamp_format' not in process_properties:
            raise Exception(
                'Unique time distribution cannot be derived without timestamp information specified in properties.')

        column_number_list = process_properties['unique_time_distribution']['columns']

        for column_number in column_number_list:
            column_name = data_dict[column_number]['name']

            stats_key = 'unique_time_distribution_' + column_name

            frequency = process_properties['unique_time_distribution'].get('frequency', 1)
            frequency_unit = process_properties['unique_time_distribution'].get('unit', 'hours')

            gap_rdd = (valid_rdd.map(lambda x: (x[column_number],
                                                round_datetime(
                                                    construct_datetime(x, process_properties['timestamp_format'],
                                                                       data_dict)
                                                    , frequency, frequency_unit))))

            gap_df = gap_rdd.toDF().selectExpr('_2 as DATETIME', '_1 as COLUMN')

            stats_collector[stats_key] = (gap_df.groupBy('DATETIME').agg(countDistinct('COLUMN').alias('UNIQUE_COLUMN'))
                                          .rdd.map(lambda x: [x['DATETIME'], x['UNIQUE_COLUMN']]).collect())

        store_accumulator_dict(acc_dict, stats_collector)

    # ------------------------------------------------------------------------------------------------------------------
    # ------------------------unique values of columns------------------------------------------------------------------
    if process_properties.get('unique_columns', None):
        if 'raw_df' not in locals():
            raw_df = generate_raw_dataframe(valid_rdd, sql_context, data_dict).cache()

        for index in process_properties['unique_columns']:
            unique_count = (raw_df.groupBy()
                            .agg(countDistinct(data_dict[index]['name'])
                                 .alias('DISTINCT_COUNT'))
                            .collect())
            stats_collector['unique_' + data_dict[index]['name']] = unique_count[0]['DISTINCT_COUNT']

        store_accumulator_dict(acc_dict, stats_collector)

    if process_properties.get('groupby_unique_columns', None):
        if 'raw_df' not in locals():
            raw_df = generate_raw_dataframe(valid_rdd, sql_context, data_dict).cache()

        for item in process_properties['groupby_unique_columns']:
            groupby_column = item[0]
            groupby_column_name = data_dict[item[0]]['name']
            count_unique_column = item[1]
            count_unique_column_name = data_dict[item[1]]['name']

            unique_count = (raw_df
                            .groupBy(groupby_column_name)
                            .agg(countDistinct(count_unique_column_name).alias('DISTINCT_COUNT'))
                            .collect())

            key_name = 'groupby_' + groupby_column_name + '_unique_' + count_unique_column_name
            stats_collector[key_name] = dict()

            for row in unique_count:
                if row[groupby_column_name] not in stats_collector[key_name]:
                    stats_collector[key_name][row[groupby_column_name]] = row['DISTINCT_COUNT']

        store_accumulator_dict(acc_dict, stats_collector)

    # ------------------------------------------------------------------------------------------------------------------
    # ------------------------total records groupby certain columns-----------------------------------------------------
    if process_properties.get('groupby_columns', None):
        if 'raw_df' not in locals():
            raw_df = generate_raw_dataframe(valid_rdd, sql_context, data_dict).cache()

        groupby_column_index = process_properties['groupby_columns']
        groupby_column_names = [data_dict[index]['name']
                                for index in groupby_column_index]

        groupby_stats = raw_df.groupBy(groupby_column_names).count().collect()

        for column in groupby_column_names:
            stats_collector['groupby_' + column] = dict()

        for row in groupby_stats:
            for column in groupby_column_names:
                stats_collector['groupby_' + column][row[column]] = \
                    stats_collector['groupby_' + column].get(row[column], 0) + row['count']

        store_accumulator_dict(acc_dict, stats_collector)

    # ------------------------------------------------------------------------------------------------------------------
    # ------------------------distribution of records update per imsi---------------------------------------------------
    if process_properties.get('transaction_distribution', None):
        if 'raw_df' not in locals():
            raw_df = generate_raw_dataframe(valid_rdd, sql_context, data_dict).cache()

        column_number = process_properties['transaction_distribution']['index']
        column_name = data_dict[column_number]['name']
        return_range = process_properties['transaction_distribution']['return_std_range']
        stats_key = 'records_distribution_' + column_name
        stats_collector[stats_key] = {}

        agg_by_count = (raw_df
                        .groupBy(column_name).count()
                        .selectExpr('count as num_records')
                        .groupBy('num_records')
                        .count())
        agg_result = agg_by_count.rdd.map(lambda x: [x['num_records'], x['count']]).collect()

        mean_value = hist_mean(agg_result)
        std_value = hist_std(agg_result)
        median_value = hist_median(agg_result)

        stats_collector[stats_key]['mean'] = mean_value
        stats_collector[stats_key]['std'] = std_value
        stats_collector[stats_key]['median'] = median_value

        upper_limit = mean_value + return_range * std_value  # mean + return_range * standard deviation
        stats_collector[stats_key]['distribution'] = [[item[0], item[1]] for item in agg_result if
                                                      item[0] <= upper_limit]

        store_accumulator_dict(acc_dict, stats_collector)

    # ------------------------------------------------------------------------------------------------------------------
    # ------------------------dump all results to json file-------------------------------------------------------------
    output_rdd = sc.parallelize([json.dumps(stats_collector)])
    output_rdd.saveAsTextFile(output_path)
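

# A hypothetical driver sketch showing how main_process might be invoked; the
# input path, column layout and property values below are illustrative
# assumptions, not taken from the original job.
from pyspark import SparkContext

if __name__ == '__main__':
    sc = SparkContext(appName='data_profiling')

    # assumed input: delimited text records, one per line
    raw_rdd = sc.textFile('hdfs:///input/raw_data/*.csv').map(lambda line: line.split(','))

    # assumed layout: column index -> metadata consumed by data_validation and friends
    data_dict = {0: {'name': 'IMSI'}, 1: {'name': 'EVENT_TIME'}}

    process_properties = {
        'timestamp_format': '%Y-%m-%d %H:%M:%S',
        'time_distribution': {'frequency': 10, 'unit': 'minutes'},
        'unique_columns': [0],
    }

    main_process(raw_rdd, sc, 'hdfs:///output/profiling_stats', data_dict, process_properties)

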
df = spark.read.format("csv")\
  .option("header", "true")\
  .option("inferSchema", "true")\
  .load("/data/retail-data/all/*.csv")\
  .coalesce(5)
df.cache()
df.createOrReplaceTempView("dfTable")


# COMMAND ----------

from pyspark.sql.functions import count
df.select(count("StockCode")).show() # 541909


# COMMAND ----------

from pyspark.sql.functions import countDistinct
df.select(countDistinct("StockCode")).show() # 4070
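

# COMMAND ----------

# The same distinct count can also be expressed in SQL against the dfTable view
# registered above.
spark.sql("SELECT COUNT(DISTINCT StockCode) FROM dfTable").show() # 4070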


# COMMAND ----------

from pyspark.sql.functions import approx_count_distinct
df.select(approx_count_distinct("StockCode", 0.1)).show() # 3364
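

# COMMAND ----------

# The second argument to approx_count_distinct is the maximum relative standard
# deviation allowed; tightening it (e.g. 0.01) should give an estimate close to
# the exact count of 4070, at the cost of more computation.
df.select(approx_count_distinct("StockCode", 0.01)).show()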


# COMMAND ----------

from pyspark.sql.functions import first, last
df.select(first("StockCode"), last("StockCode")).show()


# COMMAND ----------