Example #1
def hash_mdp_id_and_subsample(df,
                              sample_range: Optional[Tuple[float,
                                                           float]] = None):
    """Since mdp_id is a string but Pytorch Tensors do not store strings,
    we hash them with crc32, which is treated as a cryptographic hash
    (with range [0, MAX_UINT32-1]). We also perform an optional subsampling
    based on this hash value.
    NOTE: we're assuming no collisions in this hash! Otherwise, two mdp_ids
    can be indistinguishable after the hash.
    TODO: change this to a deterministic subsample.
    """
    if sample_range:
        assert (
            0.0 <= sample_range[0] <= sample_range[1] <= 100.0
        ), f"{sample_range} is invalid."

    # pyre-fixme[16]: Module `functions` has no attribute `col`.
    df = df.withColumn("mdp_id", crc32(col("mdp_id")))
    if sample_range:
        lower_bound = sample_range[0] / 100.0 * MAX_UINT32
        upper_bound = sample_range[1] / 100.0 * MAX_UINT32
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        df = df.filter((lower_bound <= col("mdp_id"))
                       & (col("mdp_id") <= upper_bound))
    return df
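
A minimal usage sketch for the function above, assuming the imports and the MAX_UINT32 constant that the snippet relies on (the constant's exact definition in the source module is an assumption here):

# Sketch only: assumed imports, constant, and toy data.
from typing import Optional, Tuple

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, crc32

MAX_UINT32 = 2 ** 32 - 1  # assumption: Spark's crc32 returns values in [0, 2**32 - 1]

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("mdp_a", 1), ("mdp_b", 2)], ["mdp_id", "reward"])

# Keep roughly the first 10% of mdp_ids, as selected by their crc32 value.
sampled = hash_mdp_id_and_subsample(df, sample_range=(0.0, 10.0))
sampled.show()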
Example #2
def createDataFile(start_date, end_date, spark_instance, jackknife_buckets,
                   sample_percent, output_path):
    feature_data_phase1 = (
        spark_instance.table(_TABLE_SOURCE)
        .select([
            _COL_ID.alias("id"),
            _DATE_PARSED.alias("date"),
            # TODO: Use MD5 instead of CRC32
            ((F.crc32(_COL_ID) / 100) % jackknife_buckets).alias("bucket"),
            lit(1).alias("is_active"),
            F.when(_COL_URI_COUNT >= _NUM_ADAU_THRESHOLD, 1)
            .otherwise(0)
            .alias("is_active_active"),
            F.to_date(_COL_PC_DATE).alias("profile_creation_date"),
        ] + list(_MAP_NATURAL_DIMENSIONS.keys()))  # list() for Python 3 dict views
        .filter((_DATE_PARSED.between(start_date, end_date))
                & (_COL_SAMPLE < sample_percent))
        .withColumn(
            "young_profile",
            F.when(col("date") < F.date_add(col("profile_creation_date"), 14),
                   "TRUE").otherwise("FALSE"))
    )

    new_profile_window = Window.partitionBy(col("id")).orderBy(col("date"))
    new_profile_data = (
        feature_data_phase1
        .filter((col("date") >= col("profile_creation_date"))
                & (col("date") <= F.date_add(col("profile_creation_date"), 6)))
        .select("*", F.rank().over(new_profile_window).alias("rank"))
        .filter(col("rank") == 1)
        .withColumn("new_profile", lit(1))
        .drop("date")
        .withColumn("date", col("profile_creation_date"))
    )

    feature_data = feature_data_phase1.alias("fd").join(
        new_profile_data.alias("np"),
        (col("fd.id") == col("np.id")) & (col("fd.date") == col("np.date")),
        how='full',
    ).select(
        [F.coalesce(col("np.new_profile"), lit(0)).alias("new_profile")] +
        [F.coalesce(col("fd.is_active"), lit(0)).alias("is_active")] + [
            F.coalesce(col("fd.is_active_active"), lit(0)).alias(
                "is_active_active")
        ] + [
            F.coalesce(col("fd.{}".format(c)), col("np.{}".format(c))).alias(c)
            for c in feature_data_phase1.columns
            if c not in ["is_active", "is_active_active"]
        ])

    once_ever_profiles = (
        feature_data
        .filter(col("is_active") == 1)
        .groupBy("id")
        .count()
        .filter(col("count") == 1)
        .select("id")
        .withColumn("single_day_profile", lit("1"))
    )

    feature_data = (
        feature_data.alias("fd")
        .join(once_ever_profiles.alias("oep"), "id", "outer")
        .fillna({"single_day_profile": "0"})
    )

    feature_data.write.partitionBy("date").mode('overwrite').parquet(
        output_path)
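
For context, the crc32-based bucket assignment used above can be tried on its own; this is a minimal sketch with an assumed column name and bucket count, mirroring the (F.crc32(...) / 100) % jackknife_buckets expression:

# Sketch only: toy data and an assumed bucket count.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
clients = spark.createDataFrame([("client-a",), ("client-b",)], ["client_id"])

jackknife_buckets = 20  # assumption
bucketed = clients.withColumn(
    "bucket", (F.crc32(F.col("client_id")) / 100) % jackknife_buckets)
bucketed.show()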
Example #3
def hash_histogram(df, column_name, max_bins):
    # Hash the column's values (cast to string) into at most `max_bins`
    # buckets with crc32, then count how many rows land in each bucket.
    hashed = df.select(
        (f.crc32(f.col(column_name).cast("string")) % max_bins).alias(column_name))
    hist = hashed.groupBy(column_name).count()
    hist = hist.rdd.collectAsMap()

    # a NULL column value hashes to NULL, so expose it under a printable key
    if None in hist:
        hist["None"] = hist[None]
        del hist[None]

    # normalize counts to relative frequencies
    values_total = float(sum(hist.values()))
    hist = {k: v / values_total for k, v in hist.items()}
    return hist
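
A minimal usage sketch, assuming f is the pyspark.sql.functions alias that the snippet relies on:

# Sketch only: assumed import alias and toy data.
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("alpha",), ("beta",), ("alpha",), (None,)], ["word"])

# Approximate distribution of `word`, hashed into at most 16 buckets.
print(hash_histogram(df, "word", max_bins=16))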
Example #4
def summary_df(df, fn):  # ,max_date):
    # drop null ad_click values
    df = df.na.drop(subset=["ad_click"])
    # remove non-search sessions (rows without any ad clicks)
    df = df[df["ad_click"] > 0]

    # sum ad_click
    sum_search_clients_daily = df.groupBy("client_id", "country", "submission_date_s3", "activity_date")\
                                        .agg(F.sum("ad_click").alias("ad_click"))
        
    # read revenue_by_country
    rev_by_country_s3_path = "s3://net-mozaws-prod-us-west-2-pipeline-analysis/nawong/revenue_by_country.csv"
    rev_by_country = sqlContext.read.csv(rev_by_country_s3_path, header=True)
    rev_by_country = rev_by_country.withColumn("rev_per_search_float", F.col("rev_per_search").cast("double"))\
                               .withColumn("yyyyMM_timestamp", F.to_timestamp(F.col("yyyymm"), "yyyyMM"))\
                               .withColumn("country_code", F.upper(F.col("country_code")))

    # add country field and revenue table - need transform to calculate transaction-level monetary value
    tbl = sum_search_clients_daily.join(rev_by_country, sum_search_clients_daily.country == rev_by_country.country_code,how='left_outer')
    spec = Window.partitionBy("client_id","country","submission_date_s3").orderBy(F.col("yyyyMM_timestamp").desc())
    # NOTE partition includes country because client may change country over time

    no_country = (
        tbl
        .where(F.isnull(F.col("yyyymm")))
        .withColumn("rev_per_search_float", F.lit(0.005))
    )

    has_country = (
        tbl
        .na.drop(subset=["yyyymm"])
        .where("yyyyMM_timestamp <= activity_date")
        .withColumn("rank", F.row_number().over(spec))
        .where("rank = 1")
        .drop("rank")
    )

    tbl2 = no_country.union(has_country)

    # drop first purchase to calculate revenue
    spec2 = Window.partitionBy("client_id").orderBy(F.col("activity_date").asc()) # earliest date has row #1
    search_rev = (
        tbl2
        .withColumn("rank", F.row_number().over(spec2))
        .where("rank > 1")
        .groupBy("client_id")
        .agg(F.avg(F.col("rev_per_search_float") * F.col("ad_click"))
             .alias("monetary_value"))
    )
    
    # compute the final dataset for the BG/NBD model
    dataset = (
        tbl2
        .groupBy("client_id")
        .agg(F.datediff(F.max('activity_date'),F.min("activity_date")).alias("recency"), 
             (F.countDistinct('activity_date')-1).alias("frequency"),
             (F.datediff(F.lit(end_date.strftime("%Y-%m-%d")).cast("date"),F.min("activity_date"))).alias("T"),
             F.sum("ad_click").alias("historical_searches"),
             F.sum(F.col('rev_per_search_float')*F.col('ad_click')).alias("historical_clv"))
        .join(search_rev, "client_id", how="left")
        .where("frequency >= 0 AND recency >= 0 AND T >= 0")
        .select("client_id", (F.crc32("client_id") % 100).alias("sample_id"), "frequency","recency","T","monetary_value","historical_searches","historical_clv")
      ).fillna(0, subset=['monetary_value'])

    # anonymize client_id with a one-way hash
    dataset = dataset.withColumn("client_id", F.sha1(dataset.client_id))

    # write dataset recency, freq, age, revenue table per client
    #dataset.write.partitionBy("sample_id").format("parquet").mode("overwrite").save(fn)
    # partitionBy() moves its column into the output directory structure, so
    # write a duplicate column to keep sample_id inside the parquet files
    duplicated = dataset.withColumn("sample_id_dupe", dataset["sample_id"])
    duplicated.write.partitionBy("sample_id_dupe").format("parquet").mode("append").save(fn)
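
The final few steps combine two crc32-related idioms worth seeing in isolation: deriving a stable 0-99 sample_id from the client id, and hashing the id itself before it is written out. A minimal, self-contained sketch (column names and values are assumptions):

# Sketch only: toy data illustrating the sample_id / anonymization steps above.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
clients = spark.createDataFrame([("c1",), ("c2",)], ["client_id"])

anonymized = (
    clients
    # deterministic 0-99 sample bucket derived from the client id
    .withColumn("sample_id", F.crc32("client_id") % 100)
    # one-way hash so the raw client_id is not persisted
    .withColumn("client_id", F.sha1("client_id"))
)
anonymized.show()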