def get_df_file_rse_ts_size(df_replicas_j_dids):
    """Combines columns to get filled and correct values from join of DIDS and REPLICAS

    Firstly, REPLICAS size value will be used. If there are files with no size values, DIDS size values will be used:
    see 'when' function order. For accessed_at and created_at, their max values will be got.

    Columns: file, rse_id, accessed_at, f_size, created_at

    df_file_rse_ts_size: files and their rse_id, size and access time are completed
    """

    # f_size is not NULL, already verified.
    # df_file_rse_ts_size.filter(col('f_size').isNull()).limit(5).toPandas()
    return df_replicas_j_dids \
        .withColumn('f_size',
                    when(col('f_size_replicas').isNotNull(), col('f_size_replicas'))
                    .when(col('f_size_dids').isNotNull(), col('f_size_dids'))
                    ) \
        .withColumn('accessed_at',
                    greatest(col('dids_accessed_at'), col('rep_accessed_at'))
                    ) \
        .withColumn('created_at',
                    greatest(col('dids_created_at'), col('rep_created_at'))
                    ) \
        .select(['f_name', 'rse_id', 'accessed_at', 'f_size', 'created_at']) \
        .cache()
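A minimal, hypothetical invocation of the function above, assuming it is defined in the same script; the column names mirror the joined REPLICAS/DIDS schema it expects, and the imports are the ones the snippet relies on:

from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col, greatest

spark = SparkSession.builder.getOrCreate()
# Two made-up rows: one with only a REPLICAS size, one with only a DIDS size.
df_replicas_j_dids = spark.createDataFrame(
    [("file_a", "rse_1", 100, None, 10, 20, 1, 2),
     ("file_b", "rse_1", None, 250, 30, 5, 3, 4)],
    ["f_name", "rse_id", "f_size_replicas", "f_size_dids",
     "dids_accessed_at", "rep_accessed_at", "dids_created_at", "rep_created_at"],
)
get_df_file_rse_ts_size(df_replicas_j_dids).show()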
Example #2
    def prepare_df(df):
        num_rows = df.count()

        # Expand dates.
        df = expand_date(df)

        df = df \
            .withColumn('Open', df.Open != '0') \
            .withColumn('Promo', df.Promo != '0') \
            .withColumn('StateHoliday', df.StateHoliday != '0') \
            .withColumn('SchoolHoliday', df.SchoolHoliday != '0')

        # Merge in store information.
        store = store_csv.join(store_states_csv, 'Store')
        df = df.join(store, 'Store')

        # Merge in Google Trend information.
        google_trend_all = prepare_google_trend()
        df = df.join(google_trend_all, ['State', 'Year', 'Week']).select(df['*'], google_trend_all.trend)

        # Merge in Google Trend for whole Germany.
        google_trend_de = google_trend_all[google_trend_all.file == 'Rossmann_DE']
        google_trend_de = google_trend_de.withColumnRenamed('trend', 'trend_de')
        df = df.join(google_trend_de, ['Year', 'Week']).select(df['*'], google_trend_de.trend_de)

        # Merge in weather.
        weather = weather_csv.join(state_names_csv, weather_csv.file == state_names_csv.StateName)
        df = df.join(weather, ['State', 'Date'])

        # Fix null values.
        df = df \
            .withColumn('CompetitionOpenSinceYear', F.coalesce(df.CompetitionOpenSinceYear, F.lit(1900))) \
            .withColumn('CompetitionOpenSinceMonth', F.coalesce(df.CompetitionOpenSinceMonth, F.lit(1))) \
            .withColumn('Promo2SinceYear', F.coalesce(df.Promo2SinceYear, F.lit(1900))) \
            .withColumn('Promo2SinceWeek', F.coalesce(df.Promo2SinceWeek, F.lit(1)))

        # Days & months competition was open, cap to 2 years.
        df = df.withColumn('CompetitionOpenSince',
                           F.to_date(F.format_string('%s-%s-15', df.CompetitionOpenSinceYear,
                                                     df.CompetitionOpenSinceMonth)))
        df = df.withColumn('CompetitionDaysOpen',
                           F.when(df.CompetitionOpenSinceYear > 1900,
                                  F.greatest(F.lit(0), F.least(F.lit(360 * 2), F.datediff(df.Date, df.CompetitionOpenSince))))
                           .otherwise(0))
        df = df.withColumn('CompetitionMonthsOpen', (df.CompetitionDaysOpen / 30).cast(T.IntegerType()))

        # Days & weeks of promotion, cap to 25 weeks.
        df = df.withColumn('Promo2Since',
                           F.expr('date_add(format_string("%s-01-01", Promo2SinceYear), (cast(Promo2SinceWeek as int) - 1) * 7)'))
        df = df.withColumn('Promo2Days',
                           F.when(df.Promo2SinceYear > 1900,
                                  F.greatest(F.lit(0), F.least(F.lit(25 * 7), F.datediff(df.Date, df.Promo2Since))))
                           .otherwise(0))
        df = df.withColumn('Promo2Weeks', (df.Promo2Days / 7).cast(T.IntegerType()))

        # Check that we did not lose any rows through inner joins.
        assert num_rows == df.count(), 'lost rows in joins'
        return df
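The greatest/least combination above is a clamping idiom: it caps a day count to the range [0, cap]. A standalone sketch with made-up values:

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
cap = 360 * 2
df = spark.createDataFrame([(-10,), (100,), (5000,)], ["days"])
# greatest(0, least(cap, days)) clamps negative values to 0 and large values to cap.
df.withColumn("days_capped",
              F.greatest(F.lit(0), F.least(F.lit(cap), F.col("days")))).show()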
def algorithm1(i, g):
    while (True):
        aggregates = g.aggregateMessages(F.collect_set(AM.msg).alias("agg"),
                                         sendToDst=F.when(
                                             AM.src['value'] == -1,
                                             AM.src["id"]))

        new_vertices = (
            g.vertices
            .join(aggregates, on="id", how="left_outer")
            .withColumn("newValue", getid_maximum_udf2("id", "agg", lit(i), "value"))
            .drop("agg")
            .withColumn("max_by_rows", greatest("value", "newValue"))
            .drop("value", "newValue")
            .withColumnRenamed("max_by_rows", "value")
        )
        cached_new_vertices = AM.getCachedDataFrame(new_vertices)
        g = GraphFrame(cached_new_vertices, g.edges)
        i += 1
        g.vertices.show()
        g.vertices.createOrReplaceTempView("temp_table")
        if (spark.sql("SELECT * from temp_table where value = -1").count() == 0
            ):
            final_df = g.vertices
            break
    return final_df
def algorithm2(i, g):
    while (True):
        aggregates = g.aggregateMessages(F.collect_set(AM.msg).alias("agg"),
                                         sendToDst=F.when(
                                             AM.src['value'] == -1,
                                             AM.src["id"]))

        new_vertices = (
            g.vertices
            .join(aggregates, on="id", how="left_outer")
            .withColumn("newValue", getid_maximum_udf2("id", "agg", lit(i), "value"))
            .drop("agg")
            .withColumn("max_by_rows", greatest("value", "newValue"))
            .drop("value", "newValue")
            .withColumnRenamed("max_by_rows", "value")
        )
        cached_new_vertices = AM.getCachedDataFrame(new_vertices)
        g = GraphFrame(cached_new_vertices, g.edges)
        i += 1
        g.vertices.show()
        if (g.filterVertices(
                "value == -1").dropIsolatedVertices().edges.count() == 0):
            final_df = g.vertices
            final_df = final_df.withColumn(
                "value",
                F.when(final_df["value"] == -1,
                       i).otherwise(final_df["value"]))
            break
    return final_df
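The two GraphFrames snippets above do not show their imports; they presumably rely on something like the following, with getid_maximum_udf2 being a user-defined function that is not part of this example:

import pyspark.sql.functions as F
from pyspark.sql.functions import lit, greatest
from graphframes import GraphFrame
from graphframes.lib import AggregateMessages as AM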
Example #5
def glean_2(invoice_df, line_item_df):
    invoice = invoice_df.alias('invoice')
    line_item = line_item_df.alias('line_item')
    joined_table = invoice.join(line_item,
                                invoice.invoice_id == line_item.invoice_id,
                                how='left')
    glean2 = joined_table.groupBy(
        invoice.invoice_id, invoice.canonical_vendor_id, invoice.invoice_date,
        invoice.period_end_date).agg(
            max_('line_item.period_end_date').alias('max_line_end_date'))
    glean2 = glean2.withColumn(
        'end_date', funcs.greatest('max_line_end_date',
                                   invoice.period_end_date))
    glean2 = glean2.withColumn('difference',
                               funcs.datediff('end_date', 'invoice_date'))
    glean2 = glean2[glean2['difference'] > 90]
    glean2 = glean2.withColumn('glean_location', funcs.lit('invoice'))
    glean2 = glean2.withColumn("glean_type", funcs.lit('accrual_alert'))
    glean2 = glean2.withColumn(
        "glean_text",
        funcs.concat(
            funcs.lit('Line items from vendor '),
            funcs.col('canonical_vendor_id'),
            funcs.lit(' in this invoice cover future periods (through '),
            funcs.col('end_date'), funcs.lit(' )')))
    glean2 = glean2.withColumn('glean_date', funcs.col('invoice_date'))
    return glean2
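The glean_2 snippet uses the aliases funcs and max_ without showing them; they presumably come from imports along these lines:

import pyspark.sql.functions as funcs
from pyspark.sql.functions import max as max_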
Example #6
 def calc_min_max():
     if len(sdf.columns) > 1:
         min_col = F.least(*map(F.min, sdf))
         max_col = F.greatest(*map(F.max, sdf))
     else:
         min_col = F.min(sdf.columns[-1])
         max_col = F.max(sdf.columns[-1])
     return sdf.select(min_col, max_col).first()
Example #7
def compile_greatest(t, expr, scope, **kwargs):
    op = expr.op()

    src_columns = t.translate(op.arg, scope)
    if len(src_columns) == 1:
        return src_columns[0]
    else:
        return F.greatest(*src_columns)
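For reference, F.greatest ignores null inputs and only returns null when every argument is null; a quick standalone check with made-up data:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, None, 3), (None, None, None)], "a int, b int, c int")
df.select(F.greatest("a", "b", "c").alias("g")).show()
# first row -> 3 (nulls skipped); second row -> null (all inputs null)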
    def user_item_serendipity(self):
        """Calculate serendipity of each item in the recommendations for each user.
        The metric definition is based on the following references:

        :Citation:

            Y.C. Zhang, D.Ó. Séaghdha, D. Quercia and T. Jambor, Auralist:
            introducing serendipity into music recommendation, WSDM 2012

            Eugene Yan, Serendipity: Accuracy’s unpopular best friend in Recommender Systems,
            eugeneyan.com, April 2020

        Returns:
            pyspark.sql.dataframe.DataFrame: A dataframe with columns: col_user, col_item, user_item_serendipity.
        """
        # for every col_user, col_item in reco_df, join all interacted items from train_df.
        # These interacted items are repeated for each item in reco_df for a specific user.
        if self.df_user_item_serendipity is None:
            self.df_cosine_similarity = self._get_cosine_similarity()
            self.df_user_item_serendipity = (
                self.reco_df.select(
                    self.col_user,
                    self.col_item,
                    F.col(self.col_item).alias(
                        "reco_item_tmp"
                    ),  # duplicate col_item to keep
                )
                .join(
                    self.train_df.select(
                        self.col_user, F.col(self.col_item).alias("train_item_tmp")
                    ),
                    on=[self.col_user],
                )
                .select(
                    self.col_user,
                    self.col_item,
                    F.least(F.col("reco_item_tmp"), F.col("train_item_tmp")).alias(
                        "i1"
                    ),
                    F.greatest(F.col("reco_item_tmp"), F.col("train_item_tmp")).alias(
                        "i2"
                    ),
                )
                .join(self.df_cosine_similarity, on=["i1", "i2"], how="left")
                .fillna(0)
                .groupBy(self.col_user, self.col_item)
                .agg(F.mean(self.sim_col).alias("avg_item2interactedHistory_sim"))
                .join(self.reco_df, on=[self.col_user, self.col_item])
                .withColumn(
                    "user_item_serendipity",
                    (1 - F.col("avg_item2interactedHistory_sim"))
                    * F.col(self.col_relevance),
                )
                .select(self.col_user, self.col_item, "user_item_serendipity")
                .orderBy(self.col_user, self.col_item)
            )
        return self.df_user_item_serendipity
Example #9
 def __fence(df, values):
     colname, (lfence, ufence) = list(values.items())[0]
     # Generates two columns, for lower and upper fences
     # and then applies `greatest` and `least` functions
     # to effectively fence the values.
     return (df.withColumn('__fence', F.lit(lfence))
             .withColumn(colname, F.greatest(colname, '__fence'))
             .withColumn('__fence', F.lit(ufence))
             .withColumn(colname, F.least(colname, '__fence'))
             .drop('__fence'))
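A standalone sketch of the fencing idea above: clamp a column to hypothetical fences 2.0 and 10.0, using greatest for the lower bound and least for the upper one.

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0,), (5.0,), (42.0,)], ["x"])
clipped = (df.withColumn("x", F.greatest("x", F.lit(2.0)))
             .withColumn("x", F.least("x", F.lit(10.0))))
clipped.show()  # 1.0 -> 2.0, 5.0 stays, 42.0 -> 10.0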
 def Silhouette(self,
                clusterlab=None,
                sildistmethod="Euclidean",
                silfilter_str=None,
                dbname="risk"):
     '''return Silhouette index'''
     from pyspark.sql.functions import col, avg, greatest
     import pyspark.sql.functions as F
     if clusterlab == None:
         clusterlab = self.clusterLabelCol
     prwise_cart = self.pairwise_dist(self.df.select([self.idcol] +
                                                     self.varnames),
                                      distmethod=sildistmethod,
                                      filter_str=silfilter_str)
     prwise_cart.createOrReplaceTempView("pairwise_dist")
     hive_context.sql("drop table if exists " + dbname + ".pairwise_dist")
     hive_context.sql(
         "create table pairwise_dist as select * from pairwise_dist")
     del prwise_cart
     prwise_cart = hive_context.table("pairwise_dist")
     ID_cluster_link = self.df.select([self.idcol, clusterlab])
     ID_cluster_link.createOrReplaceTempView("id_cluster")
     hive_context.sql("drop table if exists " + dbname + ".id_cluster")
     hive_context.sql("create table id_cluster as select * from id_cluster")
     del ID_cluster_link
     ID_cluster_link = hive_context.table("id_cluster")
     # a big cartesian join for pairwise points; computation is an N^2 mapping
     ID_pairwise_cart = prwise_cart.alias("dist").join(ID_cluster_link.alias("id1"),col("id1."+self.idcol)==col("dist.ID1"))\
                                         .join(ID_cluster_link.alias("id2"),col("id2."+self.idcol)==col("dist.ID2"))\
                                         .selectExpr("id1."+clusterlab+" as ID1_"+clusterlab,\
                                                     "id2."+clusterlab+" as ID2_"+clusterlab,\
                                                     "dist.*")
     # for each point i, the minimum over the other clusters of the average distance to that cluster
     ID_pairwise_bi = ID_pairwise_cart.filter("ID1_" + clusterlab +" <> "+"ID2_" + clusterlab)\
                                     .groupBy("ID1_" + clusterlab,"ID1","ID2_" + clusterlab)\
                                     .agg(avg(sildistmethod+"_distance").alias("avg_distance_"+"ID2"+clusterlab))
     ID_pairwise_bi = ID_pairwise_bi.groupBy(
         "ID1_" + clusterlab, "ID1").agg(
             F.min("avg_distance_" + "ID2" + clusterlab).alias("b_i"))
     # average distance from point i to the points of its own cluster
     ID_pairwise_ai = ID_pairwise_cart.filter("ID1_" + clusterlab +" = "+"ID2_" + clusterlab)\
                                     .groupBy("ID1_" + clusterlab,"ID1")\
                                     .agg(avg(sildistmethod+"_distance").alias("a_i"))
     # calculate the (b_i - a_i) / max(a_i, b_i) formula
     ID_pairwise_aibi = ID_pairwise_ai.alias("a").join(ID_pairwise_bi.alias("b"), ID_pairwise_ai.ID1 == ID_pairwise_bi.ID1)\
                                                 .selectExpr("a.*","b.b_i")
     #calculate silhouette for each data point
     ID_pairwise_aibi = ID_pairwise_aibi.withColumn(
         "silouette", (ID_pairwise_aibi["b_i"] - ID_pairwise_aibi["a_i"]) /
         greatest(ID_pairwise_aibi["a_i"], ID_pairwise_aibi["b_i"]))
     Silhouette = ID_pairwise_aibi.select("silouette").agg(
         avg("silouette")).collect()[0][0]
     hive_context.sql("drop table if exists " + dbname + ".pairwise_dist")
     hive_context.sql("drop table if exists " + dbname + ".id_cluster")
     return (Silhouette, ID_pairwise_aibi)
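For reference, the per-point score computed above is the standard silhouette, s(i) = (b_i - a_i) / max(a_i, b_i), where a_i is the average distance from point i to the other points of its own cluster and b_i is the smallest average distance from i to the points of any other cluster; the returned index is the mean of s(i) over all points.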
Example #11
def build_tsne_matrix(
        spark,
        latent_matrix,
        genre_df='hdfs:/user/yw2115/gooreads_book_genres_initial.json.gz',
        save_csv='tsne_matrix.csv'):
    """
    saves the csv for the tsne plot in viz.py
    # reference: https://stackoverflow.com/questions/46179453/how-to-compute-maximum-per-row-and-return-a-colum-of-max-value-and-another-colu

    genre_df: hdfs:/user/yw2115/gooreads_book_genres_initial.json.gz, downloaded from goodreads online
    latent_matrix: output from load_latent(model)

    return: None
    saves: data structure with book_id, the latent factors (lf's) from the model, and the matched genre
    """

    from pyspark.sql.types import StringType
    from pyspark.sql.functions import col, greatest, udf, array
    import pyspark.sql.functions as f

    genre_df = spark.read.json(genre_df)

    genre_at = genre_df.select('book_id',f.expr('genres.children'),f.expr('genres.`comics, graphic`'),\
        f.expr('genres.`fantasy, paranormal`'),f.expr('genres.fiction'), \
        f.expr('genres.`history, historical fiction, biography`'), f.expr('genres.`mystery, thriller, crime`'),\
        f.expr('genres.`non-fiction`'),f.expr('genres.poetry'),f.expr('genres.romance'),f.expr('genres.`young-adult`'))
    #genre_at = genre_at.toDF()
    #genre_only = genre_at.drop('book_id')

    df1 = genre_at.withColumn(
        "maxValue", greatest(*[col(x) for x in genre_at.columns[1:]]))

    col_arr = df1.columns

    def modify_values(r):
        for i in range(len(r[:-1])):
            if r[i] == r[-1]:
                return col_arr[i]

    modify_values_udf = udf(modify_values, StringType())

    df1 = df1.withColumn("maxColumn", modify_values_udf(array(df1.columns)))
    book_genre = df1.select('book_id', 'maxColumn')

    tsne_matrix = latent_matrix.join(book_genre, on='book_id', how='inner')

    #tsne_matrix.createOrReplaceTempView('spark_df')
    #books = spark.sql('SELECT DISTINCT book_id FROM spark_df')
    #splits = books.randomSplit([0.25, 0.75], seed=42)
    #book_samp = splits[0]

    # save to csv for py script
    tsne_matrix.coalesce(1).write.csv(save_csv)
def main():

    spark = SparkSession \
      .builder \
      .appName("Supported_Tables_Aggregations") \
      .getOrCreate()

    sc = spark.sparkContext
    sc._jsc.hadoopConfiguration().set(
        "fs.s3n.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId",
                                      os.environ['AWS_ACCESS_KEY_ID'])
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey",
                                      os.environ['AWS_SECRET_ACCESS_KEY'])

    # load zipcode and income datasets from S3
    # df_review = spark.read.json("s3n://susiehuang-s3/yelp_json_all/yelp_academic_dataset_business.json")

    df_census = spark.read.format("csv").option(
        "header",
        "true").load("s3n://susiehuang-s3/yelp_all_csv/census_data.csv")
    df_zipcode1 = spark.read.format("csv").option(
        "header",
        "true").load("s3n://susiehuang-s3/yelp_all_csv/zipcode_county.csv")
    df_zipcode = df_zipcode1.select(
        col("zip_code").alias("zipcode"),
        col("state").alias("state_code"),
        col("county").alias("County"),
        col("city").alias("City"))

    # data transformation for income and zipcode datasets
    df1 = df_census.join(df_zipcode, (df_census.County == df_zipcode.County))

    cond = "psf.when" + ".when".join([
        "(psf.col('" + c + "') == psf.col('max_value'), psf.lit('" + c + "'))"
        for c in df1.columns
        if c in ['Hispanic', 'White', 'Black', 'Native', 'Asian', 'Pacific']
    ])

    df2= df1.withColumn("max_value", psf.greatest(df1.Hispanic,df1.White, df1.Black, df1.Native, df1.Asian, df1.Pacific))\
        .withColumn("MAX_Racial", eval(cond))

    df3 = df2.select('zipcode', 'state_code', 'State', 'County', 'City',
                     'Income', 'max_value', 'MAX_Racial')

    # export to DB
    df3.createOrReplaceTempView("zipcode_income_table")
    output = spark.sql("SELECT * FROM zipcode_income_table")
    output.write.format('jdbc').options(url='jdbc:xxx://10.0.0.7/business',
                                        driver='com.xxx.jdbc.Driver',
                                        dbtable='zipcode_income',
                                        user='******',
                                        password='******').mode('append').save()
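The eval()-based when-chain above can also be built by chaining Column.when directly, which avoids eval; a sketch under the same column assumptions:

import pyspark.sql.functions as psf

races = ['Hispanic', 'White', 'Black', 'Native', 'Asian', 'Pacific']
max_racial = psf.when(psf.col(races[0]) == psf.col('max_value'), psf.lit(races[0]))
for c in races[1:]:
    max_racial = max_racial.when(psf.col(c) == psf.col('max_value'), psf.lit(c))
# df2 = df1.withColumn("MAX_Racial", max_racial)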
def add_domtopic(df):
    """
    find the dominant topic of each sample/row/document
    input: dataframe of weight of each topic
    output: the raw dominant topic number dataframe
    """

    argmax_udf = lambda cols: F.udf(lambda *args: argmax(cols, *args),
                                    StringType())
    return (df.withColumn(
        'domtopic',
        argmax_udf(df.columns[2:])(*df.columns[2:])).withColumn(
            'weight', F.greatest(*[F.col(x) for x in df.columns[2:-1]])))
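The argmax helper used by argmax_udf is not shown above; a plausible, purely hypothetical implementation would return the name of the column that holds the row maximum:

def argmax(cols, *args):
    # hypothetical helper: name of the column whose value equals the row maximum
    return cols[args.index(max(args))]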
Example #14
def test_greatest(data_gen):
    num_cols = 20
    s1 = gen_scalar(data_gen, force_no_nulls=True)
    # we want lots of nulls
    gen = StructGen(
        [('_c' + str(x), data_gen.copy_special_case(None, weight=100.0))
         for x in range(0, num_cols)],
        nullable=False)
    command_args = [f.col('_c' + str(x)) for x in range(0, num_cols)]
    command_args.append(s1)
    data_type = data_gen.data_type
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: gen_df(spark, gen).select(f.greatest(*command_args)))
Example #15
def quotient(primary_col: str, secondary_col: str, output_col: str,
             df: DataFrame):
    """The quotient is simply the minimum value divided by the maximum value
    Note that if the values are the same this will result in a score of 1.0,
    but if the values are very different this will result in scores close to 0.0"""

    return df.withColumn(
        output_col,
        F.when(
            F.col(primary_col).isNull() | F.col(secondary_col).isNull(),
            None).otherwise(
                F.least(F.col(primary_col), F.col(secondary_col)) /
                F.greatest(F.col(primary_col), F.col(secondary_col))),
    )
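A minimal usage sketch for quotient, with a made-up DataFrame and F standing for pyspark.sql.functions as in the definition above:

from pyspark.sql import SparkSession, DataFrame
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(2.0, 8.0), (5.0, 5.0), (None, 3.0)], "a double, b double")
quotient("a", "b", "similarity", df).show()
# 2/8 -> 0.25, identical values -> 1.0, null input -> null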
Example #16
    def get_bins(sdf, bins):
        # 'data' is a Spark DataFrame that selects all columns.
        if len(sdf.columns) > 1:
            min_col = F.least(*map(F.min, sdf))
            max_col = F.greatest(*map(F.max, sdf))
        else:
            min_col = F.min(sdf.columns[-1])
            max_col = F.max(sdf.columns[-1])
        boundaries = sdf.select(min_col, max_col).first()

        # divides the boundaries into bins
        if boundaries[0] == boundaries[1]:
            boundaries = (boundaries[0] - 0.5, boundaries[1] + 0.5)

        return np.linspace(boundaries[0], boundaries[1], bins + 1)
Example #17
        def get_predictions(label_col):
            pred_col = filter(lambda x: label_col in x, preds_cols)

            cond_lst = [
                F.when(
                    F.col(_x) == F.col("max_value"),
                    F.lit(_x.split("_{}_".format(label_col))[-1]))
                for _x in pred_col
            ]

            cond = reduce(lambda left, right: left.otherwise(right), cond_lst)

            tmp_sdf = preds_sdf.withColumn("max_value", F.greatest(*(F.col(c) for c in preds_cols)))\
                .withColumn(label_col, cond)

            return tmp_sdf.select(["id2", label_col])
def generate_graphs(data_1, data_2, name_1, name_2):
    def group_by_product(data):
        avg_stars = data.groupby('product_id', 'product_title').agg(f.avg('star_rating'), 
                                                                    f.count('review_id'), 
                                                                    f.min(to_date(data['review_date'], 'yyyy-MM-dd')),
                                                                    f.stddev('star_rating'))
        avg_stars = avg_stars.withColumnRenamed('count(review_id)', 'n_reviews') \
                            .withColumnRenamed('avg(star_rating)', 'rating') \
                            .withColumnRenamed("min(to_date(`review_date`, 'yyyy-MM-dd'))", 'first_date') \
                            .withColumnRenamed('stddev_samp(star_rating)', 'std_rating')
        return avg_stars
        
    avg_1 = group_by_product(data_1)
    avg_2 = group_by_product(data_2)


    # To be able to differentiate columns after a later join
    c1 = avg_1.alias("c1")
    c2 = avg_2.alias("c2")

    c1_c2 = c1.join(c2, f.col('c1.product_id') == f.col('c2.product_id'))
    latest_date = c1_c2.select(f.col('c1.product_id'),greatest(f.col('c1.first_date'), f.col('c2.first_date'))) \
                        .withColumnRenamed("product_id", "id").withColumnRenamed("greatest(c1.first_date, c2.first_date)", 'latest_date')

    c1_common_with_date = data_1.join(latest_date, data_1['product_id'] == latest_date['id'])
    c1_common_reviews = c1_common_with_date.where('review_date >= latest_date')

    c2_common_with_date = data_2.join(latest_date, data_2['product_id'] == latest_date['id'])
    c2_common_reviews = c2_common_with_date.where('review_date >= latest_date')

    common_c1_avg = group_by_product(c1_common_reviews)
    common_c2_avg = group_by_product(c2_common_reviews)

    c1_pd = common_c1_avg.toPandas()
    c2_pd = common_c2_avg.toPandas()

    plt.figure(figsize=(10,6))
    plt.boxplot([c1_pd['rating'], c2_pd['rating']], 0, sym='',autorange=True, labels=[name_1, name_2])
    plt.title('Distribution of the average ratings / product - '+ name_1 + " vs " + name_2)
    plt.ylabel('Average rating')
    plt.ylim(2.4, 5.1)
    plt.savefig(IMG_PATH + "countries/average_rating_" + name_1 + "_" + name_2 + ".png", bbox_inches='tight')
    plt.clf()
def prepare_df(
    df: pyspark.sql.DataFrame,
    store_csv: pyspark.sql.DataFrame,
    store_states_csv: pyspark.sql.DataFrame,
    state_names_csv: pyspark.sql.DataFrame,
    google_trend_csv: pyspark.sql.DataFrame,
    weather_csv: pyspark.sql.DataFrame,
) -> pyspark.sql.DataFrame:
    num_rows = df.count()

    # expand dates
    df = expand_date(df)

    # create boolean columns in the DataFrame for special events (promo, holidays, and whether the store was open).
    df = (df.withColumn("Open", df.Open != "0").withColumn(
        "Promo",
        df.Promo != "0").withColumn("StateHoliday",
                                    df.StateHoliday != "0").withColumn(
                                        "SchoolHoliday",
                                        df.SchoolHoliday != "0"))

    # merge store information
    store = store_csv.join(store_states_csv, "Store")
    df = df.join(store, "Store")

    # merge Google Trend information
    google_trend_all = prepare_google_trend(google_trend_csv)
    df = df.join(google_trend_all,
                 ["State", "Year", "Week"]).select(df["*"],
                                                   google_trend_all.trend)

    # merge in Google Trend for whole Germany
    google_trend_de = google_trend_all[google_trend_all.file ==
                                       "Rossmann_DE"].withColumnRenamed(
                                           "trend", "trend_de")
    df = df.join(google_trend_de,
                 ["Year", "Week"]).select(df["*"], google_trend_de.trend_de)

    # merge weather
    weather = weather_csv.join(state_names_csv,
                               weather_csv.file == state_names_csv.StateName)
    df = df.join(weather, ["State", "Date"])

    # fix null values
    df = (df.withColumn(
        "CompetitionOpenSinceYear",
        F.coalesce(df.CompetitionOpenSinceYear, F.lit(1900)),
    ).withColumn(
        "CompetitionOpenSinceMonth",
        F.coalesce(df.CompetitionOpenSinceMonth, F.lit(1)),
    ).withColumn("Promo2SinceYear",
                 F.coalesce(df.Promo2SinceYear, F.lit(1900))).withColumn(
                     "Promo2SinceWeek", F.coalesce(df.Promo2SinceWeek,
                                                   F.lit(1))))

    # days and months since the competition has been open, cap it to 2 years
    df = df.withColumn(
        "CompetitionOpenSince",
        F.to_date(
            F.format_string("%s-%s-15", df.CompetitionOpenSinceYear,
                            df.CompetitionOpenSinceMonth)),
    )
    df = df.withColumn(
        "CompetitionDaysOpen",
        F.when(
            df.CompetitionOpenSinceYear > 1900,
            F.greatest(
                F.lit(0),
                F.least(F.lit(360 * 2),
                        F.datediff(df.Date, df.CompetitionOpenSince)),
            ),
        ).otherwise(0),
    )
    df = df.withColumn("CompetitionMonthsOpen",
                       (df.CompetitionDaysOpen / 30).cast(T.IntegerType()))

    # days and weeks of promotion, cap it to 25 weeks
    df = df.withColumn(
        "Promo2Since",
        F.expr(
            'date_add(format_string("%s-01-01", Promo2SinceYear), (cast(Promo2SinceWeek as int) - 1) * 7)'
        ),
    )
    df = df.withColumn(
        "Promo2Days",
        F.when(
            df.Promo2SinceYear > 1900,
            F.greatest(
                F.lit(0),
                F.least(F.lit(25 * 7), F.datediff(df.Date, df.Promo2Since))),
        ).otherwise(0),
    )
    df = df.withColumn("Promo2Weeks",
                       (df.Promo2Days / 7).cast(T.IntegerType()))

    # ensure that no row was lost through inner joins
    assert num_rows == df.count(), "lost rows in joins"
    return df
Example #20
def assignment_transformation(expedia_df, hotels_weather_df):
    # enriching booking data with avg temp on the srch_ci and duration of stay
    expedia_enriched = (expedia_df.join(
        hotels_weather_df, (expedia_df.hotel_id == hotels_weather_df.id) &
        (expedia_df.srch_ci == hotels_weather_df.wthr_date)).select(
            expedia_df["*"], hotels_weather_df.avg_c).withColumn(
                "duration_of_stay", datediff(col("srch_co"), col("srch_ci"))))
    # enriching expedia data with stay types for further summing
    stay_data = (expedia_enriched.withColumn(
        "short_stay",
        when(col("duration_of_stay") == 1, 1).otherwise(0)).withColumn(
            "erroneous_data",
            when(
                (col("duration_of_stay") <= 0) | (col("duration_of_stay") > 30)
                | (col("duration_of_stay").isNull()),
                1).otherwise(0)).withColumn(
                    "standard_stay",
                    when((col("duration_of_stay") >= 2) &
                         (col("duration_of_stay") < 7),
                         1).otherwise(0)).withColumn(
                             "standard_extended_stay",
                             when((col("duration_of_stay") >= 8) &
                                  (col("duration_of_stay") < 14),
                                  1).otherwise(0)).withColumn(
                                      "long_stay",
                                      when((col("duration_of_stay") >= 15) &
                                           (col("duration_of_stay") < 30),
                                           1).otherwise(0)).withColumn(
                                               "batch_timestamp",
                                               current_timestamp()))
    # if dataframe is streaming for aggregation we need to define a watermark period.
    stay_data = stay_data.withWatermark(
        "batch_timestamp", "1 minute") if stay_data.isStreaming else stay_data

    # grouping and calculating stay types for each hotel
    cnt = (stay_data.groupBy("hotel_id", "batch_timestamp").agg(
        sum("short_stay").alias("short_stay_cnt"),
        sum("erroneous_data").alias("erroneous_data_cnt"),
        sum("standard_stay").alias("standard_stay_cnt"),
        sum("standard_extended_stay").alias("standard_extended_stay_cnt"),
        sum("long_stay").alias("long_stay_cnt")))

    # calculating most popular stay type for each hotel
    return (cnt.withColumn(
        "popular_stay_cnt",
        greatest("erroneous_data_cnt", "short_stay_cnt", "standard_stay_cnt",
                 "standard_extended_stay_cnt", "long_stay_cnt")
    ).withColumn(
        "most_popular_stay_type",
        when(
            col("popular_stay_cnt") == cnt["erroneous_data_cnt"],
            "Erroneous data").when(
                col("popular_stay_cnt") == cnt["short_stay_cnt"],
                "Short stay").when(
                    col("popular_stay_cnt") == cnt["standard_stay_cnt"],
                    "Standard stay").when(
                        col("popular_stay_cnt") ==
                        cnt["standard_extended_stay_cnt"],
                        "Standard extended stay").when(
                            col("popular_stay_cnt") == cnt["long_stay_cnt"],
                            "Long stay")).select(
                                cnt["*"], col("most_popular_stay_type")))
Example #21
def tocolumns(df, expr):
    import pyspark.sql.functions as fcns

    if isinstance(expr, histbook.expr.Const):
        return fcns.lit(expr.value)

    elif isinstance(expr, (histbook.expr.Name, histbook.expr.Predicate)):
        return df[expr.value]

    elif isinstance(expr, histbook.expr.Call):
        if expr.fcn == "abs" or expr.fcn == "fabs":
            return fcns.abs(tocolumns(df, expr.args[0]))
        elif expr.fcn == "max" or expr.fcn == "fmax":
            return fcns.greatest(*[tocolumns(df, x) for x in expr.args])
        elif expr.fcn == "min" or expr.fcn == "fmin":
            return fcns.least(*[tocolumns(df, x) for x in expr.args])
        elif expr.fcn == "arccos":
            return fcns.acos(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arccosh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "arcsin":
            return fcns.asin(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arcsinh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "arctan2":
            return fcns.atan2(tocolumns(df, expr.args[0]),
                              tocolumns(df, expr.args[1]))
        elif expr.fcn == "arctan":
            return fcns.atan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arctanh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "ceil":
            return fcns.ceil(tocolumns(df, expr.args[0]))
        elif expr.fcn == "copysign":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "cos":
            return fcns.cos(tocolumns(df, expr.args[0]))
        elif expr.fcn == "cosh":
            return fcns.cosh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "rad2deg":
            return tocolumns(df, expr.args[0]) * (180.0 / math.pi)
        elif expr.fcn == "erfc":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "erf":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "exp":
            return fcns.exp(tocolumns(df, expr.args[0]))
        elif expr.fcn == "expm1":
            return fcns.expm1(tocolumns(df, expr.args[0]))
        elif expr.fcn == "factorial":
            return fcns.factorial(tocolumns(df, expr.args[0]))
        elif expr.fcn == "floor":
            return fcns.floor(tocolumns(df, expr.args[0]))
        elif expr.fcn == "fmod":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "gamma":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "hypot":
            return fcns.hypot(tocolumns(df, expr.args[0]),
                              tocolumns(df, expr.args[1]))
        elif expr.fcn == "isinf":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "isnan":
            return fcns.isnan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "lgamma":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "log10":
            return fcns.log10(tocolumns(df, expr.args[0]))
        elif expr.fcn == "log1p":
            return fcns.log1p(tocolumns(df, expr.args[0]))
        elif expr.fcn == "log":
            return fcns.log(tocolumns(df, expr.args[0]))
        elif expr.fcn == "pow":
            return fcns.pow(tocolumns(df, expr.args[0]),
                            tocolumns(df, expr.args[1]))
        elif expr.fcn == "deg2rad":
            return tocolumns(df, expr.args[0]) * (math.pi / 180.0)
        elif expr.fcn == "sinh":
            return fcns.sinh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sin":
            return fcns.sin(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sqrt":
            return fcns.sqrt(tocolumns(df, expr.args[0]))
        elif expr.fcn == "tanh":
            return fcns.tanh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "tan":
            return fcns.tan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "trunc":
            raise NotImplementedError(
                expr.fcn)  # FIXME (fcns.trunc is for dates)
        elif expr.fcn == "xor":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "conjugate":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "exp2":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "heaviside":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "isfinite":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "left_shift" and isinstance(expr.args[1],
                                                     histbook.expr.Const):
            return fcns.shiftLeft(tocolumns(df, expr.args[0]),
                                  expr.args[1].value)
        elif expr.fcn == "log2":
            return fcns.log2(tocolumns(df, expr.args[0]))
        elif expr.fcn == "logaddexp2":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "logaddexp":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "mod" or expr.fcn == "fmod":
            return tocolumns(df, expr.args[0]) % tocolumns(df, expr.args[1])
        elif expr.fcn == "right_shift" and isinstance(expr.args[1],
                                                      histbook.expr.Const):
            return fcns.shiftRight(tocolumns(df, expr.args[0]),
                                   expr.args[1].value)
        elif expr.fcn == "rint":
            return fcns.rint(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sign":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "where":
            return fcns.when(tocolumns(df, expr.args[0]),
                             tocolumns(df, expr.args[1])).otherwise(
                                 tocolumns(df, expr.args[2]))
        elif expr.fcn == "numpy.equal":
            return tocolumns(df, expr.args[0]) == tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.not_equal":
            return tocolumns(df, expr.args[0]) != tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.less":
            return tocolumns(df, expr.args[0]) < tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.less_equal":
            return tocolumns(df, expr.args[0]) <= tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.isin":
            return tocolumns(df, expr.args[0]) in tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_not":
            return ~tocolumns(df, expr.args[0])
        elif expr.fcn == "numpy.add":
            return tocolumns(df, expr.args[0]) + tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.subtract":
            return tocolumns(df, expr.args[0]) - tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.multiply":
            return tocolumns(df, expr.args[0]) * tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.true_divide":
            return tocolumns(df, expr.args[0]) / tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_or":
            return tocolumns(df, expr.args[0]) | tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_and":
            return tocolumns(df, expr.args[0]) & tocolumns(df, expr.args[1])
        else:
            raise NotImplementedError(expr.fcn)

    else:
        raise AssertionError(expr)
Example #22
def fillspark(hist, df):
    import pyspark.sql.functions as fcns

    indexes = []
    for axis in hist._group + hist._fixed:
        exprcol = tocolumns(df, histbook.instr.totree(axis._parsed))

        if isinstance(axis, histbook.axis.groupby):
            indexes.append(exprcol)

        elif isinstance(axis, histbook.axis.groupbin):
            scaled = (exprcol - float(axis.origin)) * (1.0 /
                                                       float(axis.binwidth))
            if axis.closedlow:
                discretized = fcns.floor(scaled)
            else:
                discretized = fcns.ceil(scaled) - 1
            indexes.append(
                fcns.nanvl(
                    discretized * float(axis.binwidth) + float(axis.origin),
                    fcns.lit("NaN")))

        elif isinstance(axis, histbook.axis.bin):
            scaled = (exprcol -
                      float(axis.low)) * (int(axis.numbins) /
                                          (float(axis.high) - float(axis.low)))
            if axis.closedlow:
                discretized = fcns.floor(scaled) + 1
            else:
                discretized = fcns.ceil(scaled)
            indexes.append(
                fcns.when(
                    fcns.isnull(exprcol) | fcns.isnan(exprcol),
                    int(axis.numbins) + 2).otherwise(
                        fcns.greatest(
                            fcns.lit(0),
                            fcns.least(fcns.lit(int(axis.numbins) + 1),
                                       discretized))))

        elif isinstance(axis, histbook.axis.intbin):
            indexes.append(
                fcns.greatest(
                    fcns.lit(0),
                    fcns.least(fcns.lit(int(axis.max) - int(axis.min) + 1),
                               fcns.round(exprcol - int(axis.min) + 1))))

        elif isinstance(axis, histbook.axis.split):

            def build(x, i):
                if i < len(axis.edges):
                    if axis.closedlow:
                        return build(x.when(exprcol < float(axis.edges[i]), i),
                                     i + 1)
                    else:
                        return build(
                            x.when(exprcol <= float(axis.edges[i]), i), i + 1)
                else:
                    return x.otherwise(i)

            indexes.append(
                build(
                    fcns.when(
                        fcns.isnull(exprcol) | fcns.isnan(exprcol),
                        len(axis.edges) + 1), 0))

        elif isinstance(axis, histbook.axis.cut):
            indexes.append(fcns.when(exprcol, 0).otherwise(1))

        else:
            raise AssertionError(axis)

    aliasnum = [-1]

    def alias(x):
        aliasnum[0] += 1
        return x.alias("@" + str(aliasnum[0]))

    index = alias(fcns.struct(*indexes))

    selectcols = [index]
    if hist._weightoriginal is not None:
        weightcol = tocolumns(df, histbook.instr.totree(hist._weightparsed))
    for axis in hist._profile:
        exprcol = tocolumns(df, histbook.instr.totree(axis._parsed))
        if hist._weightoriginal is None:
            selectcols.append(alias(exprcol))
            selectcols.append(alias(exprcol * exprcol))
        else:
            selectcols.append(alias(exprcol * weightcol))
            selectcols.append(alias(exprcol * exprcol * weightcol))

    if hist._weightoriginal is None:
        df2 = df.select(*selectcols)
    else:
        selectcols.append(alias(weightcol))
        selectcols.append(alias(weightcol * weightcol))
        df2 = df.select(*selectcols)

    aggs = [fcns.sum(df2[n]) for n in df2.columns[1:]]
    if hist._weightoriginal is None:
        aggs.append(fcns.count(df2[df2.columns[0]]))

    def getornew(content, key, nextaxis):
        if key in content:
            return content[key]
        elif isinstance(nextaxis, histbook.axis.GroupAxis):
            return {}
        else:
            return numpy.zeros(hist._shape, dtype=histbook.hist.COUNTTYPE)

    def recurse(index, columns, axis, content):
        if len(axis) == 0:
            content += columns

        elif isinstance(axis[0],
                        (histbook.axis.groupby, histbook.axis.groupbin)):
            content[index[0]] = recurse(
                index[1:], columns, axis[1:],
                getornew(content, index[0],
                         axis[1] if len(axis) > 1 else None))
            if isinstance(axis[0], histbook.axis.groupbin) and None in content:
                content["NaN"] = content[None]
                del content[None]

        elif isinstance(
                axis[0],
            (histbook.axis.bin, histbook.axis.intbin, histbook.axis.split)):
            i = index[0] - (1 if not axis[0].underflow else 0)
            if int(i) < axis[0].totbins:
                recurse(index[1:], columns, axis[1:], content[int(i)])

        elif isinstance(axis[0], histbook.axis.cut):
            recurse(index[1:], columns, axis[1:],
                    content[0 if index[0] else 1])

        else:
            raise AssertionError(axis[0])

        return content

    query = df2.groupBy(df2[df2.columns[0]]).agg(*aggs)

    def wait():
        for row in query.collect():
            recurse(row[0], row[1:], hist._group + hist._fixed, hist._content)

    return wait
Example #23
def run(w=14, l=114, threshold=0):
    # Creating Data Frame and filtering by threshold
    pattern, k = random_pattern(l, w), w

    Chiaromonte = [[91, -114, -31, -123], [-114, 100, -125, -31],
                   [-31, -125, 100, -114], [-123, -31, -114, 91]]

    spark = SparkSession.builder.appName('Distributed FSWM').getOrCreate()

    df = spark.read.text("data/example.fasta")

    # Read the sequences
    sequences = df.where(~df.value.contains('>')).rdd.map(list).map(
        lambda x: (x[0].encode('ascii'))).map(list)

    # Defining schema for data frame
    schema = StructType([
        StructField("id", IntegerType()),
        StructField("Sequence", ArrayType(StringType()))
    ])

    df = spark.createDataFrame(
        (tuple([_id, data[0]])
         for _id, data in enumerate(map(lambda x: [x], sequences.take(2)))),
        schema=schema)

    # Creating ngrams
    ngram = NGram(n=w, inputCol="Sequence", outputCol="ngrams")
    df_clean = ngram.transform(df).select(["id", "ngrams"])

    # Exploding ngrams into the data frame
    df_explode = df_clean.withColumn('ngrams', explode('ngrams'))

    # Defining the reducer
    # Create the UDF object wrapping the reducer_concat python function
    udf_object = udf(lambda y: reducer_concat(y), IntegerType())

    # Here we should have for all the sequences

    df_w0 = df_explode.where(df_clean.id == 0)
    df_w0 = df_w0.withColumn("id0",
                             monotonically_increasing_id() +
                             1).withColumnRenamed('ngrams',
                                                  'w0').select('id0', 'w0')
    df0 = df_w0.withColumn("word0",
                           udf_object(df_w0.w0)).select("id0", "word0")
    df0.show()

    df_w1 = df_explode.where(df_clean.id == 1)
    df_w1 = df_w1.withColumn("id1",
                             monotonically_increasing_id() +
                             1).withColumnRenamed('ngrams',
                                                  'w1').select('id1', 'w1')
    df1 = df_w1.withColumn("word1",
                           udf_object(df_w1.w1)).select("id1", "word1")
    df1.show(truncate=False)

    df_result = df0.crossJoin(df1) \
        .withColumn("spaced_word", udf_spaced_words(pattern)(col("word0"), col("word1"))) \
        .where(col("spaced_word").isNotNull()) \
        .withColumn("score", udf_score(pattern, k, Chiaromonte)(col("word0"), col("word1"))) \
        .where(col("score") > threshold) \
        .orderBy(["spaced_word", "score"], ascending=False) \
        .withColumn("min", least(col("id0"), col("id1"))) \
        .withColumn("max", greatest(col("id0"), col("id1"))) \
        .drop_duplicates(subset=["spaced_word", "min"]) \
        .drop_duplicates(subset=["spaced_word", "max"]) \
        .withColumn("JukesCantor", udf_jukes_cantor(pattern, k)(col("word0"), col("word1")))

    df_result.show()

    p = df_result.agg(suma("JukesCantor")).collect()[0][0] * 1.0 / (
        (k - bin(pattern).count("1") / 2) * df_result.count())

    print(JukesCantor(p))
def amend_device_tracking(observations_df, tracking_df, last_updated_by):  # type: (DataFrame, DataFrame, str) -> typing.Tuple[DataFrame, DataFrame, DataFrame]
    """
    Blends new observations into an existing device tracking dataset.

    :param observations_df: New observations to be used for amending the device tracking data set.
    :param tracking_df: The device tracking data set.
    :param last_updated_by: The last updated user/process tracking field
    :return: A 3-tuple of (modified device tracking records only, updated full device tracking data set, device tracking records for never-before-seen devices only)
    """
    observations_df = observations_df.alias('o')
    tracking_df = tracking_df.alias('t')

    pk = ['organization', 'mac']

    # Find the tracking records that are changed by the new observations.

    delta_df = observations_df.select(
        'organization',
        'mac',
        'first_observed_at',
        'last_observed_at'
    ).join(
        tracking_df,
        on=pk,
        how='left'
    ).where(
        col('t.mac').isNull()
        | (col('o.first_observed_at') < col('t.first_observed_at'))
        | (col('o.last_observed_at') > col('t.last_observed_at'))
    ).select(
        'o.organization',
        'o.mac',
        least('o.first_observed_at', 't.first_observed_at').name('first_observed_at'),
        greatest('o.last_observed_at', 't.last_observed_at').name('last_observed_at'),
    ).cache()

    # Create a new version of the entire device tracking dataset, and checkpoint it to break the cyclic lineage
    # caused by reading from and writing to the same table.

    refresh_df = tracking_df.join(
        delta_df,
        on=pk,
        how='left_anti'  # Retain only the unmodified records.
    ).unionByName(
        delta_df.select(
            '*',
            current_timestamp().name('last_updated_at'),
            lit(last_updated_by).name('last_updated_by')
        )
    ).coalesce(
        1
    ).checkpoint(
        eager=True
    )

    # Find any never-before-seen devices.

    new_devices_df = delta_df.join(
        tracking_df,
        on=pk,
        how='left_anti'
    ).cache()

    return delta_df, refresh_df, new_devices_df
Example #25
    sqlcontext.registerDataFrameAsTable(df7, "df")
    df8 = sqlcontext.sql("""SELECT df.playername
			            , MAX(CASE WHEN df.prediction = 'Zone1' THEN df.hitrate END) AS Zone1hitrate
		  	            , MAX(CASE WHEN df.prediction = 'Zone2' THEN df.hitrate END) AS Zone2hitrate
			            , MAX(CASE WHEN df.prediction = 'Zone3' THEN df.hitrate END) AS Zone3hitrate
		    	            , MAX(CASE WHEN df.prediction = 'Zone4' THEN df.hitrate END) AS Zone4hitrate
			         FROM df
		             GROUP BY df.playername""")

    #adding a column with the highest hitrate to the previous table
    df9 = df8.select(df8.playername \
    , df8.Zone1hitrate \
    , df8.Zone2hitrate \
    , df8.Zone3hitrate \
    , df8.Zone4hitrate \
    , greatest("Zone1hitrate","Zone2hitrate","Zone3hitrate","Zone4hitrate").alias("besthitrate"))

    #using the besthitrate column to determine the best zone for each player
    df10 = df9.withColumn("bestzone",when(df9.Zone1hitrate == df9.besthitrate, "Zone1") \
      .when(df9.Zone2hitrate == df9.besthitrate, "Zone2") \
      .when(df9.Zone3hitrate == df9.besthitrate, "Zone3") \
      .when(df9.Zone4hitrate == df9.besthitrate, "Zone4"))

    sqlcontext.registerDataFrameAsTable(df10, "df")
    print(
        "The NBA player have been classified into four confortable zones, with the following structure: [shotclock,shotdist,closedefdist]"
    )
    print("The four zones are:")
    print("Zone1: %s" % (centroid1))
    print("Zone2: %s" % (centroid2))
    print("Zone3: %s" % (centroid3))
Example #26
df = sqlContext.read.format('parquet').load(
    'hdfs:/scholar_data/tokens_count_by_year.parquet')

# keep only tokens that are at least 3 characters long
df = df.filter('LENGTH(entities) > 2')

# gather column names linked to years
col_years = [col_name for col_name in df.columns]
col_years.remove('entities')

# Find peak usage of token across the years
# https://stackoverflow.com/questions/40874657/pyspark-compute-row-maximum-of-the-subset-of-columns-and-add-to-an-exisiting-da
minf = F.lit(float("-inf"))
df = df.withColumn(
    "year_max",
    F.greatest(*[F.coalesce(F.col(year), minf) for year in col_years]))

# forget about tokens that have never been really used
df = df.filter("year_max > 10").drop('year_max')

# find total number of "valid" tokens used on each year
df = df.join(df.groupby().sum(*col_years))

# retrieve token frequency (times common coefficient) for each year
# the coefficient is there so that we do not hit the limits of float precision too hard
for year in col_years:
    df = df.withColumn(year, 100000.0 * F.col(year) /
                       F.col(f'sum({year})')).drop(f'sum({year})')

# store results
df.write.save('hdfs:/scholar_data/tokens_freq_by_year.parquet',
Example #27
    def startCalculation(self):
        spark = self.sparkSession
        sc = spark.sparkContext

        #cache dataframes
        tw = self.tweet_df
        tw.cache()
        u = self.user_df
        u.cache()

        # calculate duration of dataset
        dates = tw.select('created_at').rdd.map(
            lambda r: convert_twitter_date(r[0])).collect()
        duration_of_dataset = self.get_duration_of_dataset(dates)

        # list of names
        self.list_screen_names = u.select('screen_name').rdd.map(
            lambda r: r[0]).distinct().collect()

        #list of categories
        list_categories = tw.select("category").rdd.map(
            lambda r: r[0]).distinct().collect()
        list_categories.sort()
        self.list_categories = list_categories

        # format dates and remove hour info
        format_dates = udf(convert_twitter_date_noHour, DateType())
        updated_tweet_df = tw.withColumn("formatted_date",
                                         format_dates(tw["created_at"]))

        #calculate tweets count of all users
        joined_df = u.join(updated_tweet_df,
                           u.id == updated_tweet_df.userId,
                           how='left')
        tweets_total = joined_df.groupBy("id").count().orderBy(
            'count',
            ascending=False).withColumnRenamed("count", "tweets_total")

        #calculate tweets count of all users by topic
        tweets_by_topic = joined_df.groupBy("id").pivot(
            "category").count().fillna(0, subset=list_categories)
        tweets_by_topic_nested = tweets_by_topic.select(
            "id",
            struct(list_categories).alias("dict_tweet_by_topic"))

        #calculate days posted of all users by topic
        days_posted_by_topic = joined_df.groupBy(
            "id", "formatted_date").pivot("category").count().fillna(
                0, subset=list_categories)
        for cat in list_categories:
            days_posted_by_topic = days_posted_by_topic.withColumn(
                cat,
                when(days_posted_by_topic[cat] > 0, 1).otherwise(0))
        days_posted_by_topic_summed = days_posted_by_topic.groupBy("id").agg(
            *[sum(c).alias(c) for c in list_categories])
        days_posted_by_topic_nested = days_posted_by_topic_summed.select(
            "id",
            struct(list_categories).alias("dict_days_posted_by_topic"))

        #join tweets_total, tweets_by_topic_nested and, days_posted_by_topic_nested
        temp_u = tweets_total.join(tweets_by_topic_nested,
                                   "id").join(days_posted_by_topic_nested,
                                              "id")

        #calculate focus rate
        for cat in list_categories:
            temp_u = temp_u.withColumn(
                cat,
                col("dict_tweet_by_topic.{}".format(cat)) /
                greatest(lit(1), col("tweets_total")))
        temp_u = temp_u.select(
            "id", "tweets_total", "dict_tweet_by_topic",
            "dict_days_posted_by_topic",
            struct(list_categories).alias("dict_focus_rate"))

        #calculate activeness1
        for cat in list_categories:
            temp_u = temp_u.withColumn(
                cat,
                col("dict_days_posted_by_topic.{}".format(cat)) /
                duration_of_dataset)
        temp_u = temp_u.select(
            "id", "tweets_total", "dict_tweet_by_topic",
            "dict_days_posted_by_topic", "dict_focus_rate",
            struct(list_categories).alias("dict_activeness_1"))

        #calculate activeness2
        for cat in list_categories:
            temp_u = temp_u.withColumn(
                cat,
                col("dict_tweet_by_topic.{}".format(cat)) /
                duration_of_dataset)
        temp_u = temp_u.select(
            "id", "tweets_total", "dict_tweet_by_topic",
            "dict_days_posted_by_topic", "dict_focus_rate",
            "dict_activeness_1",
            struct(list_categories).alias("dict_activeness_2"))

        # calculate activeness 3: tweet count per category, weighted by days posted, divided by the dataset duration
        for cat in list_categories:
            temp_u = temp_u.withColumn(
                cat,
                col("dict_tweet_by_topic.{}".format(cat)) *
                col("dict_days_posted_by_topic.{}".format(cat)) /
                duration_of_dataset)
        temp_u = temp_u.select(
            "id", "tweets_total", "dict_tweet_by_topic",
            "dict_days_posted_by_topic", "dict_focus_rate",
            "dict_activeness_1", "dict_activeness_2",
            struct(list_categories).alias("dict_activeness_3"))

        # set results: collect all computed feature columns into a single user_features struct per user
        self.results_df = temp_u.select(
            "id",
            struct(temp_u.columns[1:]).alias("user_features{}".format((
                "_" + self.method_name) if len(self.method_name) > 0 else "")))
Example #28
def max(df, cols: List[str]):
    # F.greatest expects column arguments, not a list, so unpack cols
    return df.select(F.greatest(*cols))
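
A minimal usage sketch for the helper above, assuming F (pyspark.sql.functions), List (typing) and an active SparkSession named spark are available at module level; the sample column names are illustrative only. Note that the helper shadows Python's built-in max, so a less ambiguous name (e.g. row_max) may be preferable in real code.

from typing import List

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 7, 3), (9, 2, 5)], ['a', 'b', 'c'])

# returns a single column holding the row-wise maximum of a, b and c
max(df, ['a', 'b', 'c']).show()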
Example #29
##Jas_Labour_Production_df.select('HOURS_BILLABLE').distinct().show()
Jas_Labour_Production_df = Jas_Labour_Production_df \
    .filter(Jas_Labour_Production_df['TRANSACTION_DATE1'] >= lit("2019-01-01")) \
    .filter(Jas_Labour_Production_df['TRANSACTION_DATE1'] <= lit("2019-02-21"))
##Jas_Labour_Production_df.printSchema()

## Perform a join between the two given sources and store the result in an intermediate dataframe
Jas_Labour_Interm_df = Jas_Labour_Paid_df \
    .join(Jas_Labour_Production_df, Jas_Labour_Paid_df.KEY_COL == Jas_Labour_Production_df.KEY_COL1, 'inner') \
    .select('TRANSACTION_DATE', 'EMPLOYEE_NUMBER', 'KEY_COL', 'EMPLOYEE_NAME', 'PROGRAM_DESC', 'AVAIL_OT_HRS',
            'WEEK_NUMBER', 'DIRECT_INDIRECT', 'HOURS_TOTAL', Jas_Labour_Production_df['PROD_REG_HRS'],
            Jas_Labour_Production_df['PROD_OT_HRS'], Jas_Labour_Production_df['HOURS_BILLABLE'])

##Final dataframe with the required columns
Jas_Labour_Final_df = Jas_Labour_Interm_df \
    .withColumn('AVAIL_REG_HRS',
                when(col('HOURS_BILLABLE') != 0, col('HOURS_BILLABLE')).otherwise(0)) \
    .withColumn('EXP_REG_HRS',
                when((col('HOURS_TOTAL') != 0) & (col('PROD_REG_HRS') != 0),
                     greatest(col('HOURS_TOTAL'), col('PROD_REG_HRS')))
                .when((col('HOURS_TOTAL') != 0) & (col('PROD_REG_HRS') == 0), col('HOURS_TOTAL'))
                .when((col('HOURS_TOTAL') == 0) & (col('PROD_REG_HRS') != 0), col('PROD_REG_HRS'))) \
    .withColumn('RATE_TYPE', lit('JAS')) \
    .withColumn('RATE_VALUE', lit(80)) \
    .withColumn('COMPANY_CODE', lit(1)) \
    .withColumn('SOURCE', lit('Quantum'))

##Jas_Labour_Final_df.printSchema()
Jas_Labour_Final_df = Jas_Labour_Final_df.select(
        'TRANSACTION_DATE', 'EMPLOYEE_NUMBER',
        col('DIRECT_INDIRECT').alias('JOB_CATEGORY'),
        'PROGRAM_DESC', 'COMPANY_CODE',
        col('WEEK_NUMBER').alias('WEEK_NO'), 'SOURCE',
        col('HOURS_TOTAL').alias('TOTAL_REG_HOURS'),
        col('AVAIL_OT_HRS').alias('TOTAL_OT_HOURS'),
        col('PROD_REG_HRS').alias('CHARGEABLE_REG_HOURS'),
        col('PROD_OT_HRS').alias('CHARGEABLE_OT_HOURS'),
        col('EXP_REG_HRS').alias('NON_CHARGEABLE_REG_HOURS')) \
    .withColumn('EMPLOYEE_BASE', lit('N/A')) \
    .withColumn('BASE_DESCRIPTION', lit('N/A')) \
    .withColumn('SHOP_CODE', lit(0)) \
    .withColumn('SHOP_DESCRIPTION', lit('N/A'))
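
The EXP_REG_HRS when-chain above effectively picks the larger of HOURS_TOTAL and PROD_REG_HRS whenever at least one of them is non-zero, and leaves the value null when both are zero. Assuming both hour columns are non-negative and non-null, a more compact equivalent expression is sketched below; this is an illustration only, not a change to the pipeline above.

from pyspark.sql.functions import col, greatest, when

# take the larger of the two hour columns when at least one is non-zero;
# otherwise leave the result null, matching the original when-chain
exp_reg_hrs = when(
    (col('HOURS_TOTAL') != 0) | (col('PROD_REG_HRS') != 0),
    greatest(col('HOURS_TOTAL'), col('PROD_REG_HRS')))

# e.g. Jas_Labour_Interm_df.withColumn('EXP_REG_HRS', exp_reg_hrs)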
Example #30
def Validate(ngrams, sampleSizes, ctxSize, sqc, seqs, outFile, minval, maxval, avg, nlines):

	accuracy = []
	gramSize = GramSize(ctxSize, lookahead)

	# slope (c1) and intercept (c2) of the linear transform applied by the scaleback() UDF below
	c1 = (((maxval - minval) * 1.0) / nlines) / avg
	c2 = ((minval * 1.0) / nlines) / avg
	print(seqs.count())
				


	ngrams = ngrams.repartition(1 << nPartLog)
	ngrams.cache()

	#we will validate separately for each vector size
	for vecSize in vecSizes:
		print('======TESTING FOR VECTOR SIZE', vecSize)
		#start fresh
		old_ngrams = ngrams
		ngrams = ngrams.withColumn('correct', lit(0))



		#use models from each sample
		modelId = 0
		for sampleSize in sampleSizes:

			w2v = Word2VecModel.load(w2vFile(outDir, ctxSize, sampleSize, vecSize))
			lrmodels = []
			for dim in range(0, vecSize):
				lrmodels.append(LinearRegressionModel.load(lrmFile(outDir, ctxSize, sampleSize, vecSize, dim)))

			success = 0
			fail = 0
			unopt = 0

			#add columns to store model success and failure
			modelSucc = 'succ_' + str(modelId)
			modelFail = 'fail_' + str(modelId)
			modelUnopt = 'unopt_' + str(modelId)
			seqs = seqs.withColumn(modelSucc, lit(0)) \
						.withColumn(modelFail, lit(0)) \
						.withColumn(modelUnopt, lit(0))
			modelId = modelId + 1



			ngrams = ngrams \
				.withColumn('predSeq', lit(''))

			#create initial feature vector
			#transform each word into a cluster center
			words, d, centers = ClusterWords(w2v, seqs)
		
			#record correctness for this model only
			old_ngrams = ngrams
			ngrams = ngrams.withColumn('sample_correct', lit(0)).withColumn('sample_confi', lit(1.0))

			for nextPos in range(0,lookahead):
				#build the feature vector
				ngrams = BuildSubstringFeature(ngrams, w2v, nextPos, nextPos + ctxSize, ctxSize, lookahead,)

				#build the prediction vector
				ngrams = BuildPredictionVector(ngrams, lrmodels, ctxSize, vecSize)


			

				#now assign a cluster id to each prediction vector
				old_ngrams = ngrams
				ngrams = centers.transform(ngrams).withColumnRenamed('cluster', 'predWord').withColumnRenamed('vector', 'predictionVector')
				
				
				#get the predicted word
				ngrams = ngrams.join(broadcast(words), words.cluster == ngrams.predWord, 'inner') \
								.drop('cluster') #\

				#calculate the cosine similarity between prediction vector and center vector 
				epsilon = 0.0001
				def CosineSimi (v1, v2):
					d1 = DenseVector(v1)
					d2 = DenseVector(v2)
					n1 = d1.norm(2)
					n2 = d2.norm(2)
					return float(d1.dot(d2) / (n1 * n2))
				cossim = udf(lambda v1, v2: CosineSimi(v1, v2), DoubleType())
				ngrams = ngrams.withColumn('simi', cossim('centerVector', 'predictionVector'))
				ngrams = ngrams.drop('centerVector').drop('predictionVector')


				#update predicted sequence
				ngrams = ngrams.withColumn('predSeq', concat_ws(' ', 'predSeq', 'word')) 
				ngrams = ngrams.withColumn('predSeq', ltrim(ngrams.predSeq))


				#get actual sequence
				ngrams = CreateSubstring(ngrams, 'sentence', 'actualSeq', gramSize, ' ', ctxSize, ctxSize + nextPos + 1)


				#now get the cluster id for the predicted word in the sentence
				ngrams = BuildLabelVector(ngrams, w2v, ctxSize, lookahead, nextPos).withColumnRenamed('labelVec', 'vector').drop('ngrams')
				ngrams = centers.transform(ngrams).drop('vector')

				#and host latency for actual word
				ngrams = ngrams.join(broadcast(words), 'cluster', 'inner') \
						.drop('word') \
						.drop('centerVector') #\
				
				
			
				#record correctness
				ngrams = ngrams.withColumn('round_correct', when((ngrams.predWord != ngrams.cluster) | (ngrams.simi < confidence), 0).otherwise(nextPos + 1)).drop('predWord').drop('cluster')
				ngrams = ngrams.withColumn('sample_correct', when(ngrams.sample_correct + 1 == ngrams.round_correct, ngrams.round_correct).otherwise(ngrams.sample_correct)) 




				#get overall correctness
				ngrams = ngrams.withColumn('correct', greatest('sample_correct', 'correct'))

				#get binary correctness
				ngrams = ngrams.withColumn('binary_correct', when(ngrams.correct >= nextPos + 1, 1).otherwise(0))
				ngrams = ngrams.withColumn('sample_confi', when(ngrams.binary_correct == 1, 1.0).otherwise(least(ngrams.simi, ngrams.sample_confi)))
				ngrams = ngrams.withColumn('simi', when(ngrams.binary_correct == 1, ngrams.simi).otherwise(ngrams.sample_confi))


				ngrams = ngrams.withColumn('predSeq', when((ngrams.binary_correct == 1) | (ngrams.simi < confidence), ngrams.actualSeq).otherwise(ngrams.predSeq))
				ngrams = ngrams.withColumn('succ_wt', when(ngrams.binary_correct == 1, ngrams.wt).otherwise(0))
				ngrams = ngrams.withColumn('fail_wt', when((ngrams.binary_correct == 1) | (ngrams.simi < confidence), 0).otherwise(ngrams.wt))
				ngrams = ngrams.withColumn('unopt_wt', when((ngrams.binary_correct == 0) & (ngrams.simi < confidence), ngrams.wt).otherwise(0))
				ngrams = ngrams.drop('simi')

				#now summarize success and failure rates by predicted sequence
				seqWts = ngrams.groupBy('predSeq').agg(sum('succ_wt').alias('succ_wt'), sum('fail_wt').alias('fail_wt'), sum('unopt_wt').alias('unopt_wt'))

				#update sequences table
				seqs = seqWts.join(broadcast(seqs), seqWts.predSeq==seqs.word, 'right_outer').drop('predSeq').fillna(-c2/c1, ['succ_wt', 'fail_wt', 'unopt_wt'])


				scaleback = udf(lambda s: float(s*c1 + c2), DoubleType())
				seqs = seqs.withColumn(modelSucc, col(modelSucc) + scaleback(seqs.succ_wt)).drop('succ_wt')
				seqs = seqs.withColumn(modelFail, col(modelFail) + scaleback(seqs.fail_wt)).drop('fail_wt')
				seqs = seqs.withColumn(modelUnopt, col(modelUnopt) + scaleback(seqs.unopt_wt)).drop('unopt_wt')
				seqs.cache()

				aggregated = seqs.agg(sum(modelSucc), sum(modelFail), sum(modelUnopt))
				aggregated.cache()
				new_success = aggregated.head()['sum(' + modelSucc + ')']
				new_fail = aggregated.head()['sum(' + modelFail + ')']
				new_unopt = aggregated.head()['sum(' + modelUnopt + ')']
				print(nextPos, new_success - success, new_fail - fail, new_unopt - unopt)
				success = new_success
				fail = new_fail
				unopt = new_unopt


		#end for testing for each model for a particular vector size

	#end for each vector size


	seqs.orderBy('succ_0', ascending=False).write.mode('overwrite').csv(outputFile(outDir, ctxSize, vecSize, sampleSizes))


	return accuracy
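
The confidence check inside Validate above hinges on a cosine-similarity UDF over dense vectors. A minimal standalone sketch of that pattern, assuming an active SparkSession named spark and toy array columns (in the real pipeline the vectors come from the Word2Vec and clustering stages):

from pyspark.ml.linalg import DenseVector
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

spark = SparkSession.builder.getOrCreate()

def cosine_simi(v1, v2):
    # cosine similarity = dot(v1, v2) / (||v1|| * ||v2||)
    d1, d2 = DenseVector(v1), DenseVector(v2)
    return float(d1.dot(d2) / (d1.norm(2) * d2.norm(2)))

cossim = udf(cosine_simi, DoubleType())

df = spark.createDataFrame(
    [([1.0, 0.0], [1.0, 1.0])],
    ['centerVector', 'predictionVector'])
df.withColumn('simi', cossim('centerVector', 'predictionVector')).show()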