Example no. 1
def analysis_6(units_df, primary_person_df, log):
    """In two ways found the result for Query 6.
    :param units_df: DataFrame Units_use.
    :param primary_person_df: DataFrame Primary_Person_use.
    :param log: Logger.
    :return None
    """
    joined = units_df.alias("U").join(primary_person_df.alias("P"), col("P.CRASH_ID") == col("U.CRASH_ID"))
    crashes_by_zip = (
        joined
            .filter(col("PRSN_ALC_RSLT_ID") == "Positive")
            .filter(~col("VEH_BODY_STYL_ID").contains("MOTORCYCLE"))
            .filter(col("DRVR_ZIP").isNotNull())
            .groupBy("DRVR_ZIP").agg(size(collect_set(col("P.CRASH_ID"))).alias("CRASHES"))
    )
    zip_ordered_by_crashes = crashes_by_zip.withColumn("rnk", dense_rank().over(Window.orderBy(desc("CRASHES"))))
    top5_zip = zip_ordered_by_crashes.filter(col("rnk") < 6).select("DRVR_ZIP", "CRASHES")

    # APPROACH - 2 BY CONTRIBUTING FACTOR
    contributing_factor_alcohol = (
        joined
            .filter(col("CONTRIB_FACTR_1_ID").contains("ALCOHOL") | col(
            "CONTRIB_FACTR_2_ID").contains("ALCOHOL") | col("CONTRIB_FACTR_P1_ID").contains("ALCOHOL"))
            .filter(col("DRVR_ZIP").isNotNull())
            .groupBy("DRVR_ZIP").agg(size(collect_set(col("P.CRASH_ID"))).alias("CRASHES"))
    )
    zip_by_crashes = contributing_factor_alcohol.withColumn("rnk", dense_rank().over(Window.orderBy(desc("CRASHES"))))
    top5 = zip_by_crashes.filter(col("rnk") < 6).select("DRVR_ZIP", "CRASHES")

    log.warn("Results for Query 6")
    top5_zip.show(10, False)
    top5.show(10, False)

    return None
Example no. 2
def prepare_firmware_cve_counts(firmware_cves_df: DataFrame,
                                firmware_hashes_df: DataFrame) -> DataFrame:
    # yapf: disable
    # Ensure that the windows for each of low, med, hi, and crit are over the entire firmware space instead of just
    # those for which a CVE is known to exist
    firmware_cves_full_df = firmware_hashes_df.join(
        firmware_cves_df,
        'firmware_hash',
        'left'
    ).na.fill(0)

    low_window = Window.orderBy(firmware_cves_full_df['low'])
    med_window = Window.orderBy(firmware_cves_full_df['medium'])
    high_window = Window.orderBy(firmware_cves_full_df['high'])
    crit_window = Window.orderBy(firmware_cves_full_df['critical'])
    # sum(wi*xi)/sum(wi)
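    # weights (1, 2, 3, 4) for (low, medium, high, critical) sum to 10, so the composite score stays in [0, 1]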
    cve_composite_score = (percent_rank().over(low_window) + (2 * percent_rank().over(med_window)) + (3 * percent_rank().over(high_window)) + (4 * percent_rank().over(crit_window))) / 10

    fwc_with_score_df = firmware_cves_full_df.withColumn(
        'firmware_cve_component_score', cve_composite_score
    ).select(
        'firmware_hash',
        'firmware_cve_component_score'
    )
    # yapf: enable
    return fwc_with_score_df
Example no. 3
    def initialize_edges(self, vertices):
        src = vertices.select(F.col("id")).orderBy(F.rand()).limit(self.nbr_edges) \
            .withColumnRenamed("id", "src") \
            .withColumn("id", F.row_number().over(Window.orderBy(F.monotonically_increasing_id())))

        src.createOrReplaceTempView("src")
        vertices.createOrReplaceTempView("vertices")

        query = self.spark.sql("select vertices.id from vertices minus select src.src from src")

        dst = query.orderBy(F.rand()).limit(self.nbr_edges).withColumnRenamed("id", "dst") \
            .withColumn("id", F.row_number().over(Window.orderBy(F.monotonically_increasing_id())))

        self.edges = src.join(dst, src.id == dst.id).select(F.col('src'), F.col('dst')).persist(StorageLevel.MEMORY_AND_DISK)
        return self.edges
Example no. 4
def accomodate_people(df: DataFrame):
    """[calculate accommodation possibilities for the lowest price and highest rating]

    Args:
        df (DataFrame): [spark dataframe]
    """
    (df.withColumn("rank_ranking",f.rank().over(Window.orderBy(f.desc("review_scores_value"))))
        .withColumn("price_ranking", f.rank().over(Window.orderBy("price")))
        .filter((f.col('rank_ranking') == 1) & (f.col("price_ranking")==1))
        .select(f.col("accommodates"))
        .coalesce(1)
        .write
        .option("header","true")
        .format('csv')
        .save('out/out_2_4.txt'))
Example no. 5
def main(sc, out_file_name):
    """
    Read GDELT data from S3, count occurrence of news sources,
    determine the top 100 most frequent ones, and write list to
    out_file_name
    """

    #Read 'GKG" table from GDELT S3 bucket. Transform into RDD
    gkgRDD = sc.textFile('s3a://gdelt-open-data/v2/gkg/2018*.gkg.csv')
    gkgRDD = gkgRDD.map(lambda x: x.encode("utf", "ignore"))
    gkgRDD.cache()
    gkgRDD = gkgRDD.map(lambda x: x.split('\t'))
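    # keep only well-formed GKG records (27 tab-separated fields) whose source common name (field index 3) is non-empty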
    gkgRDD = gkgRDD.filter(lambda x: len(x) == 27)
    gkgRDD = gkgRDD.filter(lambda x: f.is_not_empty([x[3]]))
    gkgRowRDD = gkgRDD.map(lambda x: Row(src_common_name=x[3]))

    sqlContext = SQLContext(sc)

    #Transform RDDs to dataframes
    gkgDF = sqlContext.createDataFrame(gkgRowRDD)

    #Frequency count for each source
    srcDF = gkgDF.select('src_common_name').groupBy('src_common_name').agg(
        count('*').alias('count'))

    #Select top 100 most frequent sources, and write to output file
    window = Window.orderBy(srcDF['count'].desc())
    rankDF = srcDF.select(
        '*',
        rank().over(window).alias('rank')).filter(col('rank') <= 100).where(
            col('src_common_name') != '')
    pandasDF = rankDF.toPandas()
    pandasDF.to_csv(out_file_name,
                    columns=["src_common_name", "count", "rank"])
Example no. 6
    def test_window_functions_without_partitionBy(self):
        df = self.sqlCtx.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
        w = Window.orderBy("key", df.value)
        from pyspark.sql import functions as F

        sel = df.select(
            df.value,
            df.key,
            F.max("key").over(w.rowsBetween(0, 1)),
            F.min("key").over(w.rowsBetween(0, 1)),
            F.count("key").over(w.rowsBetween(float("-inf"), float("inf"))),
            F.rowNumber().over(w),
            F.rank().over(w),
            F.denseRank().over(w),
            F.ntile(2).over(w),
        )
        rs = sorted(sel.collect())
        expected = [
            ("1", 1, 1, 1, 4, 1, 1, 1, 1),
            ("2", 1, 1, 1, 4, 2, 2, 2, 1),
            ("2", 1, 2, 1, 4, 3, 2, 2, 2),
            ("2", 2, 2, 2, 4, 4, 4, 3, 2),
        ]
        for r, ex in zip(rs, expected):
            self.assertEqual(tuple(r), ex[: len(r)])
Example no. 7
def main():
    spark, sc = spark_init()

    schemaDF = StructType([
        StructField("empid", StringType(), True),
        StructField("name", StringType(), True),
        StructField("rank", DoubleType(), True),
        StructField("salary", StringType(), True),
        StructField("mgrid", StringType(), True)
    ])

    empDfBr = spark.read\
    .option("header","true")\
    .schema(schemaDF)\
    .option("sep",",")\
    .format("csv")\
    .load(r"C:\data\retail_db\order_items\employee2.txt")

    #broadcastEmpDfBr = sc.broadcast(empDfBr)

    df1 = F.broadcast(empDfBr).alias("a").join(F.broadcast(empDfBr).alias("b"),\
          F.col("a.mgrid")==F.col("b.empid"),"leftouter")\
    .select(F.col("a.*"), F.col("b.name").alias("manager_name"))

    df2 = df1.select("*").agg(F.col("mgr_id")).select("*").take(1)


    df1.withColumn("total_salary", F.sum(F.col("salary")).over(Window.partitionBy(F.col("manager_name"))))\
    .withColumn("rownum", F.row_number().over(Window.orderBy(F.col("total_salary").desc())))\
     .filter(F.col("rownum")==1).select("mgrid","manager_name").show()

    df1.groupBy(F.col("mgrid")).agg(F.max(F.col("salary")).alias("max_sal")).filter(F.col("mgrid").isNotNull())\
        .select("mgrid","max_sal").collect()
Example no. 8
def city_process(input_data, output_data, spark):
    demo_df = spark.read.format('csv').load(os.path.join(
        input_data, 'demographics/*.csv'),
                                            header=True,
                                            inferSchema=True,
                                            sep=';')

    #cut down table and rename columns
    demo_df = demo_df.select('City','State Code')\
                     .withColumnRenamed('City', 'city')\
                     .withColumnRenamed('State Code', 'state_code')

    #read in airport data to get us cities
    df_air = spark.read.format('csv').load(os.path.join(
        input_data, 'airports/*.csv'),
                                           header=True,
                                           inferSchema=True)

    #filter down to only US cities
    us_air = df_air.filter(df_air.iso_country == 'US')

    #apply function and rename municipality
    us_air = us_air.withColumn('state_code', region_state(col('iso_region')))\
                   .withColumnRenamed('municipality', 'city')\
                   .select('city', 'state_code')

    #combine the two dfs together to create the final city table
    city = us_air.union(demo_df)\
          .drop_duplicates()\
          .withColumn('city_Id', F.monotonically_increasing_id())\
          .withColumn('city_id', F.row_number().over(W.orderBy('city_Id')))\
          .select('city_id', 'city', 'state_code')

    #write final df to s3 processed path
    city.write.mode('overwrite').parquet(os.path.join(output_data, 'city/'))
Example no. 9
def als_model():

    user_inventory = spark.sql("SELECT * FROM userinfo").filter(
        'playtime_forever > 0')
    ratings = user_inventory.withColumn(
        "user",
        f.dense_rank().over(Window.orderBy("userid")))
    correspond = ratings.select('userid', 'user').dropDuplicates()

    als = ALS(userCol="user", itemCol="appid", ratingCol="playtime_forever")
    model = als.fit(ratings)
    model.save("als_model")
    top20 = model.recommendForAllUsers(20)

    recommend = top20.join(correspond, top20.user == correspond.user).select(
        'userid', 'recommendations')
    recommendList = recommend.rdd.map(
        lambda x: (x[0], [appid[0] for appid in x[1]])).collect()

    for r in recommendList:
        userid = r[0]
        idList = r[1]
        spark.sql("INSERT INTO als_top20 ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s',\
						    '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')"                                                                          % \
             (userid, idList[0], idList[1], idList[2], idList[3], idList[4], idList[5], \
               idList[6], idList[7], idList[8], idList[9], idList[10], idList[11], \
                      idList[12], idList[13], idList[14], idList[15], idList[16], \
                      idList[17], idList[18], idList[19]))
Example no. 10
def generate_aggregate_player_data(shots_fixed):
    """ Generates aggregate information and unique count for NBA players based on shots taken

    Parameters
    ----------
    shots_fixed: Dataframe containing NBA player shot data and the following columns:
        'GAME_ID', 'GAME_EVENT_ID', 'PLAYER_ID', 'PLAYER_NAME',
        'EVENT_TYPE', 'LOC_X', 'LOC_Y', and 'SHOT_DISTANCE'

    Returns
    -------
    Aggregated player data

    Notes
    -----
    """

    # filter to relevant columns
    result =  shots_fixed.select('GAME_ID', 'GAME_EVENT_ID', 'PLAYER_ID', 'PLAYER_NAME', 'EVENT_TYPE', 'LOC_X', 'LOC_Y', 'SHOT_DISTANCE')

    # categorize shots
    result = result.withColumn('EVENT_TYPE', F.when(F.col('EVENT_TYPE')== 'Missed Shot', 0).otherwise(1))

    # aggregate all player data
    result = result.groupBy('PLAYER_ID', 'PLAYER_NAME').pivot('EVENT_TYPE').count()
    result = result.withColumnRenamed('0', 'missed_shot').withColumnRenamed('1', 'made_shot')

    # sort and add unique numerical id for training in TensorFlow
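    # PLAYER_ID/PLAYER_NAME pairs are unique after the groupBy above, so rank() effectively numbers the players consecutively; subtracting 1 makes the id zero-based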
    w1 = Window.orderBy("PLAYER_ID")
    result = result.withColumn('rank', F.rank().over(w1))
    result = result.withColumn('rank', F.col('rank')- 1)
    
    return result
Example no. 11
def train_test_split(full_data, country, feature_list, time_horizon):
    target_label = country + "_cases"
    prev = country + "_1lag"
    w = Window.orderBy(col("Date"))
    # Assemble feature vector where the given country's cases are the target
    vectorAssembler = VectorAssembler(inputCols=feature_list,
                                      outputCol='features')
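    # lead(col, time_horizon) over the date ordering pulls the value time_horizon rows ahead, so "actual" and "actual_diff" are the targets time_horizon days into the future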
    vdata = vectorAssembler.transform(full_data) \
      .withColumnRenamed(target_label, "actual_orig") \
      .withColumn("actual", lead(col("actual_orig"), time_horizon).over(w)) \
      .withColumn("diff", col("actual_orig")-col(prev)) \
      .withColumn("actual_diff", lead(col("diff"), time_horizon).over(w)) \
      .filter(col("actual_diff").isNotNull())

    train = vdata.select(
        ['features', "Date", "actual_orig", "diff", "actual",
         "actual_diff"]).filter(col("Date") < SPLIT_DT)
    test = vdata.select(
        ['features', "Date", "actual_orig", "diff", "actual",
         "actual_diff"]).filter(col("Date") >= SPLIT_DT)

    print("Total days: " + str(vdata.count()))
    print("Total days for train dataset: " + str(train.count()))
    print("Total days for test dataset: " + str(test.count()))
    return train, test
Example no. 12
def data_reduction(table, limite_superior, limite_inferior, KPI):
  data_alg = table.select('date_time', 'sector_id', KPI).withColumn('condition', table[KPI].between(limite_inferior, limite_superior))
  h = data_alg.withColumn('numero', F.when(data_alg['condition'] == True, 1).otherwise(-1))
# Keep a column with the numeric value of True and False (1 / -1)
  df_lag = h.withColumn('estado_anterior',
                        F.lag(h['numero'])
                                 .over(Window.orderBy('sector_id')))
# Keep the value preceding each observation,
# so we know its previous state
  result = df_lag.withColumn('derivada',
          (df_lag['numero'] - df_lag['estado_anterior']))
# Compute the derivative as the difference between the current value
# and the previous state (possible values: -2, 0, +2)
  g = result.withColumn('Start', F.when(result['derivada'] == -2,result.date_time ))
# Degradation starts where the derivative is -2 (in range -> out of range),
# keeping that date_time
  s = g.withColumn('End', F.when(g['derivada'] == 2,result.date_time ))
# Degradation ends where the derivative is +2 (out of range -> back in range),
# keeping that date_time
  PERIODO = s.select(s.Start, s.End).dropna(how = 'all')
  Start = PERIODO.select(PERIODO.Start).dropna(how = 'any')
  End = PERIODO.select(PERIODO.End).dropna(how = 'any')
 
  # withColumn cannot mix columns from two different DataFrames, so pair the
  # i-th Start with the i-th End by row position instead
  w_pair = Window.orderBy(F.monotonically_increasing_id())
  Start = Start.withColumn('pair_id', F.row_number().over(w_pair))
  End = End.withColumn('pair_id', F.row_number().over(w_pair))
  final = Start.join(End, 'pair_id').drop('pair_id')
  display(final)
Example no. 13
def getClusterData(amenities):
    n_clust = 5
    x = amenities.select('lat', 'lon').collect()
    model = KMeans(n_clusters=n_clust, random_state=353).fit(x)
    clusters = model.predict(x)
    cluster = clusters.tolist()

    centres = model.cluster_centers_

    # convert list to a dataframe
    df = sqlContext.createDataFrame([(l, ) for l in cluster], ['cluster'])
    df = df.withColumn(
        "index",
        f.row_number().over(Window.orderBy(f.monotonically_increasing_id())) -
        1)
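    # the join below aligns cluster labels back to the amenities by position: the row_number over the collected order is assumed to match the 0-based amnt_id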

    amnt = amenities.join(df, amenities.amnt_id == df.index).drop(
        "index", 'amnt_id')
    # amnt.show()

    lat = amnt.select('lat').collect()
    lon = amnt.select('lon').collect()
    cluster = amnt.select('cluster').collect()

    return lat, lon, cluster, centres, amnt
Example no. 14
def get_recordings_df(mapped_listens_df, metadata, save_path):
    """ Prepare recordings dataframe.

        Args:
            mapped_listens_df (dataframe): listens mapped with msid_mbid_mapping.
            metadata (dict): metadata dict updated with the recordings count.
            save_path (str): path where recordings_df should be saved.

        Returns:
            recordings_df: Dataframe containing distinct recordings and corresponding
                mbids and names.
    """
    recording_window = Window.orderBy('mb_recording_mbid')

    recordings_df = mapped_listens_df.select('mb_artist_credit_id',
                                             'mb_artist_credit_mbids',
                                             'mb_recording_mbid',
                                             'mb_release_mbid',
                                             'msb_artist_credit_name_matchable',
                                             'msb_recording_name_matchable') \
                                     .distinct() \
                                     .withColumn('recording_id', rank().over(recording_window))

    metadata['recordings_count'] = recordings_df.count()
    save_dataframe(recordings_df, save_path)
    return recordings_df
Example no. 15
def calculate_distance(enhanced_checkin_data):
    """ Create distance (km) column in PySpark dataframe 

    Parameters
    ----------
    enhanced_checkin_data: Dataframe that has gone through parse_and_clean_swarm_venue_responses function or
        has the following equivalent columns: createdAt, lat, lng, name, country
   

    Returns
    -------
    Same dataframe that has an additional column called distance_in_km
    
    Notes
    -----
    

    """
    # remove records that do not have geocoordinates 
    result = enhanced_checkin_data.filter(F.col('lat').isNotNull())
    
    # add prior latitude and longitude as new columns for every record
    w1 = Window.orderBy(result.createdAt.asc())
    result = result.withColumn('prior_latitude', F.lag(F.col('lat'), 1).over(w1)) \
        .withColumn('prior_longitude', F.lag(F.col('lng'), 1).over(w1)) \
        .withColumn('prior_name', F.lag(F.col('name'), 1).over(w1)) \
        .withColumn('prior_country', F.lag(F.col('country'), 1).over(w1))
    
    # remove the first data point, since it has no prior point to compute a distance against
    result = result.filter(F.col('prior_latitude').isNotNull())

    # calculate distance in km using calculate_distance_udf (defined separately)
    result = result.withColumn('distance_in_km', calculate_distance_udf('lat', 'lng', 'prior_latitude', 'prior_longitude'))

    return result 
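For reference, the snippet relies on a calculate_distance_udf that is not shown; below is a minimal haversine-based sketch of what such a UDF could look like (the formula, Earth radius, and null handling are assumptions, not the original implementation).

import math
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

@F.udf(returnType=DoubleType())
def calculate_distance_udf(lat, lng, prior_lat, prior_lng):
    # hypothetical sketch: great-circle (haversine) distance in km between two points
    if None in (lat, lng, prior_lat, prior_lng):
        return None
    r = 6371.0  # mean Earth radius in km
    phi1, phi2 = math.radians(float(lat)), math.radians(float(prior_lat))
    dphi = math.radians(float(prior_lat) - float(lat))
    dlmb = math.radians(float(prior_lng) - float(lng))
    a = math.sin(dphi / 2) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlmb / 2) ** 2
    return 2 * r * math.asin(math.sqrt(a))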
Example no. 16
def read_glove_vecs(glove_file, output_path):
    rdd = sc.textFile(glove_file)
    row = Row("glovevec")
    df = rdd.map(row).toDF()
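    # each GloVe line is "<word> <float> <float> ...", so the first token is the word and the rest form the vector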
    split_col = F.split(F.col('glovevec'), " ")
    df = df.withColumn('word', split_col.getItem(0))
    df = df.withColumn('splitted', split_col)
    vec_udf = F.udf(lambda row: [float(i) for i in row[1:]],
                    ArrayType(FloatType()))
    df = df.withColumn('vec', vec_udf(F.col('splitted')))
    df = df.drop('splitted', "glovevec")
    w = Window.orderBy(["word"])
    qdf = df.withColumn('vec',
                        F.concat_ws(',',
                                    'vec')).withColumn("id",
                                                       F.row_number().over(w))

    path = '{}/words'.format(output_path)
    qdf.coalesce(1).write.format('csv').option("sep",
                                               "\t").option('header',
                                                            'true').save(path)
    print('Words saved to: "{}"'.format(path))
    list_words = list(map(lambda row: row.asDict(), qdf.collect()))
    word_to_vec_map = {item['word']: item['vec'] for item in list_words}
    words_to_index = {item['word']: item["id"] for item in list_words}
    index_to_words = {item["id"]: item['word'] for item in list_words}
    return words_to_index, index_to_words, word_to_vec_map
Example no. 17
def get_top_charts_globally(raw_df, chart_length):
    """

    :param raw_df: Raw data to be processed
    :param chart_length: the number of records to be displayed
    :return: clean data frame containing the top-ranked combination of artist and track globally
    """
    df_raw = raw_df.select(
        'tagid',
        col('match.track.id').alias('track_id'),
        col('match.track.metadata.tracktitle').alias('track_title'),
        col('match.track.metadata.artistname').alias('artist_name'))
    # Getting distinct counts of tagids for each track
    df_agg = df_raw.groupBy("track_id").agg(
        countDistinct("tagid").alias('tag_id_count')).orderBy(
            desc("tag_id_count"))
    # Joining back to the raw dataframe as we need artist name and track name for display

    df_final = df_agg.join(df_raw,
                           df_raw['track_id'] == df_agg['track_id'],
                           how="inner")
    # Using the dense_rank function to rank each record based on the tag_id_count
    df_final = df_final.select(
        'artist_name', 'track_title', 'tag_id_count').withColumn(
            "chart_position",
            dense_rank().over(Window.orderBy(desc("tag_id_count")))).orderBy(
                desc("tag_id_count")).dropDuplicates()
    df_final = df_final.select('chart_position', 'track_title', 'artist_name') \
        .filter(col('chart_position') <= chart_length)
    return df_final
Example no. 18
def score_recommender(spark, model, df_movie, k, user_id):
    from pyspark.sql.window import Window
    from pyspark.sql.functions import row_number, col

    # rec_items = model.recommendForAllUsers(k)

    df_scoring = create_scoring_data(user_id, df_movie)

    dfs_scoring = spark.createDataFrame(df_scoring)

    dfs_predictions = model.transform(dfs_scoring) 
    dfs_predictions = dfs_predictions.dropDuplicates(['user', 'item', 'prediction'])

    window = Window.orderBy(dfs_predictions['prediction'].desc())

    df_topk = dfs_predictions \
        .select('*', row_number().over(window).alias('row_number')) \
        .filter(col('row_number') <= k) \
        .drop('row_number', 'prediction', 'rating') \
        .toPandas()

    recs = list(df_topk.item)

    print(recs)

    return recs
Example no. 19
    def find_most_common_trees(self, df):
        """Find the top 5 most commonly occurred tree  types in San Francisco area

        :param df: Input DataFrame containing all details of trees
        :return: Dataframe of top 5 common tree types
        """

        # create split rule to find the tree sub type
        split_col = split(df['species'], '::')

        # a dataframe of trees with only required fields
        comm_trees = df.select('species',
                               'tree_id').withColumn('tree_type',
                                                     split_col.getItem(1))

        # filter out the trees with no or unknown sub types and get the count of valid sub types
        comm_tree_df = (comm_trees.select('tree_type').filter(
            col('tree_type') != '').groupBy('tree_type').count().orderBy(
                col('count').desc()))

        # find the top 5 most common tree types using ranking
        most_comm_trees = comm_tree_df.withColumn(
            "rank",
            rank().over(Window.orderBy(col("count").desc()))).filter(
                col("rank") <= 5).select('tree_type', 'count')

        self.log.warn('Found the top 5 most common trees in San Francisco')

        return most_comm_trees
Example no. 20
def create_train_data():

    w1 = Window.orderBy("uid")
    w2 = Window.partitionBy("seg").orderBy("uid")
    df_train = (
        spark.read.csv(os.path.join("datasets", "train.csv"), header=True, schema=schema)
        # global row id and 1-based row index
        .withColumn("uid", monotonically_increasing_id())
        .withColumn("idx", row_number().over(w1).cast(IntegerType()))
        # group every 150000 consecutive rows into one segment
        .withColumn("seg", fn.floor((fn.col("idx") - 1) / 150000).cast(IntegerType()))
        # row number within each segment
        .withColumn("no", row_number().over(w2).cast(IntegerType()))
        # segment name, e.g. "raw_0003"
        .withColumn("name", fn.concat(lit("raw_"), fn.lpad(fn.col("seg"), 4, "0").cast(StringType())))
        .withColumn("set", lit(0))
    )

    df_train.createOrReplaceTempView("data")
    df_train_f = spark.sql("""
    SELECT uid, set, seg, no, name, x, y FROM data 
    ORDER BY set, seg, no, uid
    """)

    df_train_f = df_train_f.repartition(1)
    df_train_f.write.mode("overwrite").parquet(
        os.path.join("datasets", "train.parquet"))
Example no. 21
def save_dataset(df_pos, df_neg, path):
    df = df_pos.union(df_neg)
    w = Window.orderBy(["words_stemmed"])
    df = df.withColumn("review_id", F.row_number().over(w)).withColumn('int_seq',
                                                                       F.concat_ws(',', 'words_stemmed'))
    qdf = df.select(['review_id', 'int_seq', 'class'])
    qdf.coalesce(1).write.format('csv').option('header', 'true').save(path)
Example no. 22
def top_zip_crashes_alcohol(df_unit, df_person, top_n):
    """
    :param df_unit:
    :type df_unit:
    :param df_person:
    :type df_person:
    :param top_n:
    :type top_n:
    :return:
    :rtype:
    """
    df_joined = df_person.join(df_unit, ['CRASH_ID'], 'left') \
        .select(['CRASH_ID', 'DRVR_ZIP', 'CONTRIB_FACTR_1_ID',
                 'CONTRIB_FACTR_2_ID', 'CONTRIB_FACTR_P1_ID']) \
        .drop_duplicates()
    wspec = Window.orderBy(desc('crash_count'))
    intd_df = df_joined.filter((df_joined['CONTRIB_FACTR_1_ID'].like('%DRINKING%')) |
                        (df_joined['CONTRIB_FACTR_1_ID'].like('%ALCOHOL%')) |
                        (df_joined['CONTRIB_FACTR_2_ID'].like('%DRINKING%')) |
                        (df_joined['CONTRIB_FACTR_2_ID'].like('%ALCOHOL%')) |
                        (df_joined['CONTRIB_FACTR_P1_ID'].like('%DRINKING%')) |
                        (df_joined['CONTRIB_FACTR_P1_ID'].like('%ALCOHOL%')))
    intd_df1 = intd_df.groupBy('DRVR_ZIP').agg(countDistinct('CRASH_ID').alias('crash_count')) \
        .orderBy(desc('crash_count')).dropna()
    intd_df2 = intd_df1.withColumn('rank', dense_rank().over(wspec))
    driver_zip_obj = intd_df2.filter(intd_df2['rank'] <= top_n).collect()
    list_driver_zip = [row['DRVR_ZIP'] for row in driver_zip_obj]
    return list(enumerate(list_driver_zip, start=1))
Example no. 23
def convert_annoy_index(item_factors):
    window = Window.orderBy('id')
    item_factors = item_factors.withColumn('annoy_id',
                                           row_number().over(window))
    annoy_index_map = item_factors.select('id', 'annoy_id')
    item_factors = item_factors.select('annoy_id', 'features')
    return item_factors, annoy_index_map
Example no. 24
def convert_idf_score_to_buckets(tenant_idf):
    global IDF_VALUE
    global BUCKETS
    over_all = Window.orderBy(IDF_VALUE)
    tenant_idf_with_bucket = tenant_idf.withColumn(
        IDF_VALUE,
        ntile(BUCKETS).over(over_all))
    print("SystemLog: Done converting to bucketized score")
    return tenant_idf_with_bucket
Example no. 25
def get_cdf(df, variable, col_name):
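    # cume_dist() over the ordering by "variable" gives, for each row, the fraction of rows with a value <= the current one, i.e. the empirical CDF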

    cdf = df.select(variable).na.drop().\
        withColumn(
            col_name,
            funcs.cume_dist().over(Window.orderBy(variable))
        ).distinct()

    return cdf
Example no. 26
        def readFromTables(column, row_name='temp_rownum'):
            df = spark.read.table("temp.sample_{}_{}".format(table, 0))
            df = df.select(column)
            for i in range(len(partitions)-1):
                df_temp = spark.read.table("temp.sample_{}_{}".format(table, i+1))
                df = df.union(df_temp.select(column))
            df = df\
                .orderBy(column)\
                .withColumn(row_name, F.row_number().over(W.orderBy(column)))\
                .limit(threshold)
            return df

        ## Set parameters
        source = self.source
        db, table = tablename.split(sep='.')
        print("Counting ...")
        columns = [ column for column in source.columns if source.select(column).distinct().count() < threshold ]
        print("only enumerating: ", columns)

        ## does the table have partitions? if yes, divide by the partitions!!
        try:
            partitions = spark.sql("""SHOW PARTITIONS {}""".format(tablename)).limit(1).collect()
            print("ALERT!! Partition File, it'll takes longer time")

            ### iterate all partition file then yield some distinct element of each part
            for i, partition_str in enumerate(partitions):
                filter_condition = partition_str[0].split(sep='/')
                filter_condition = ' AND '.join(filter_condition)

                print("Executing {}".format(filter_condition))
                df_part = distinctElement(source.filter(filter_condition), columns)

                print("Saving into temp.sample_{}_{}".format(table,i))
                df_part.write\
                .format("parquet")\
                .mode("overwrite")\
                .saveAsTable("temp.sample_{}_{}".format(table, i))

            ### iterate read file, and yield global distinct elements of all partitions
            print("Re-reading")
            listoftuple = [(x+1,) for x in range(threshold)]
            df_column = spark.createDataFrame(listoftuple, schema = ['index'])
            for column in columns:
                df_temp = readFromTables(column)
                df_column = df_column.join(df_temp, df_temp.temp_rownum == df_column.index, 'left')
            df_all = df_column.orderBy('index').drop('index')

            ### Drop all sample table
            print("Drop all sample table")
            for i, partition_str in enumerate(partitions):
                spark.sql(""" DROP TABLE temp.sample_{}_{}""".format(table, i))
            return df_all
        
        ## if only the table not partitioned
        except:
            return distinctElement(self.source, columns)
Example no. 27
def transform_data_with_udf(clickstream_data, purchase_data):
    window1 = Window.partitionBy('userId').orderBy('eventTime')
    window2 = Window.orderBy('sessionId')

    clickstream_data = (
        clickstream_data
        .withColumn('appOpenFlag', app_open_flag_udf(clickstream_data['eventType']))
        .withColumn('sessionId', sum(col('appOpenFlag')).over(window1))
        .withColumn('attr', attributes_udf(clickstream_data['eventType'],
                                           clickstream_data['attributes']))
        .withColumn('campaign_id',
                    when(get_json_object('attr', '$.campaign_id').isNotNull(),
                         get_json_object('attr', '$.campaign_id')).otherwise(None))
        .withColumn('channel_id',
                    when(get_json_object('attr', '$.channel_id').isNotNull(),
                         get_json_object('attr', '$.channel_id')).otherwise(None))
        .withColumn('purchase_id',
                    when(get_json_object('attr', '$.purchase_id').isNotNull(),
                         get_json_object('attr', '$.purchase_id')).otherwise(None))
        .withColumn('campaignId',
                    last(col('campaign_id'), ignorenulls=True).over(
                        window2.rowsBetween(Window.unboundedPreceding, 0)))
        .withColumn('channelId',
                    last(col('channel_id'), ignorenulls=True).over(
                        window2.rowsBetween(Window.unboundedPreceding, 0)))
    )

    target_df = clickstream_data.join(
        purchase_data,
        clickstream_data['purchase_id'] == purchase_data['purchaseId'],
        JOIN_TYPE.LEFT)

    return target_df.select(col('purchaseId'), col('purchaseTime'),
                            col('billingCost'), col('isConfirmed'),
                            col('sessionId'), col('campaignId'),
                            col('channelId'))
Example no. 28
    def data_range(self, verbose=True):
        """
        Ensures variables within the dataframe well_df are within range, as set by the attribute thresholds.
        Out-of-range values are replaced by the previous in-range value.

        Parameters
        ----------
        verbose : bool (optional)
            whether to allow for verbose (default is True)
        """
        window = Window.orderBy("ts")  # Spark Window ordering data frames by time

        lag_names = []  # Empty list to store column names
        for well_columns in self.well_df.schema.names:  # loop through all components (columns) of data

            if well_columns != "ts":  # no tresholding for timestamp

                if well_columns in self.thresholds.keys():
                    tresh = self.thresholds[well_columns]  # set thresholds values for parameter from dictionary
                else:
                    tresh = [-1000, 1000]  # if feature not in thresholds attribute, set large thresholds

                if verbose:
                    print(well_columns, "treshold is", tresh)

                for i in range(1, 10):  # Naive approach, creating large amount of lagged features columns
                    lag_col = well_columns + "_lag_" + str(i)
                    lag_names.append(lag_col)
                    self.well_df = self.well_df.withColumn(lag_col, F.lag(well_columns, i, 0).over(window))

                for i in range(8, 0, -1):
                    lag_col = well_columns + "_lag_" + str(i)
                    prev_lag = well_columns + "_lag_" + str(i + 1)

                    # apply minimum and maximum threshold to column, and replace out of range values with previous value
                    self.well_df = self.well_df.withColumn(lag_col,
                                                           F.when(F.col(lag_col) < tresh[0],
                                                                  F.col(prev_lag))
                                                           .otherwise(F.col(lag_col)))
                    self.well_df = self.well_df.withColumn(lag_col,
                                                           F.when(F.col(lag_col) > tresh[1],
                                                                  F.col(prev_lag)).otherwise(F.col(lag_col)))

                # apply minimum and maximum threshold to column, and replace out of range values with previous value
                lag_col = well_columns + "_lag_1"
                self.well_df = self.well_df.withColumn(well_columns,
                                                       F.when(F.col(well_columns) < tresh[0],
                                                              F.col(lag_col))
                                                       .otherwise(F.col(well_columns)))
                self.well_df = self.well_df.withColumn(well_columns,
                                                       F.when(F.col(well_columns) > tresh[1],
                                                              F.col(lag_col))
                                                       .otherwise(F.col(well_columns)))

        self.well_df = self.well_df.drop(*lag_names)
        return
Example no. 29
def running_total():
    input_data_path = os.path.join(folder_path, 'orders_data', 'orders.csv')
    df = sqlContext.read \
        .option("multiline", "true") \
        .option("header", "true") \
        .csv(input_data_path)
    wind = Window.orderBy("id")
    windCol = functions.sum("orderQty").over(wind)
    df.select("*", windCol.alias("totalQuantity").cast(IntegerType())) \
        .show()
Example no. 30
def split_by_row_index(df, num_partitions=2):
    # Let's assume you don't have a row_id column that has the row order
    t = df.withColumn('_row_id', monotonically_increasing_id())
    # Using ntile() because monotonically_increasing_id is discontinuous across partitions
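    # ntile(n) assigns every row to one of n roughly equal, consecutive buckets numbered 1..n, hence the i + 1 in the filter below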
    t = t.withColumn('_partition',
                     ntile(num_partitions).over(Window.orderBy(t._row_id)))
    return [
        t.filter(t._partition == i + 1).drop('_row_id', '_partition')
        for i in range(num_partitions)
    ]
Example no. 31
def lag_generater(crimeWeek):
    crimeWeek = crimeWeek.select(
        "*",
        lag("count").over(
            Window.orderBy("yearweek")).alias("count_lag1")).na.drop()
    crimeWeek = crimeWeek.select(
        "*",
        lag("count_lag1").over(
            Window.orderBy("yearweek")).alias("count_lag2")).na.drop()
    crimeWeek = crimeWeek.select(
        "*",
        lag("count_lag2").over(
            Window.orderBy("yearweek")).alias("count_lag3")).na.drop()
    crimeWeek = crimeWeek.select(
        "*",
        lag("count_lag3").over(
            Window.orderBy("yearweek")).alias("count_lag4")).na.drop()
    crimeWeek = crimeWeek.withColumnRenamed("count", "label")
    return crimeWeek
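
The same four lag columns could also be built in a loop; a minimal, equivalent sketch (the function name is illustrative, not from the original code):

from pyspark.sql.functions import lag
from pyspark.sql.window import Window

def lag_generater_loop(crimeWeek, n_lags=4):
    w = Window.orderBy("yearweek")
    prev = "count"
    for i in range(1, n_lags + 1):
        # each pass lags the previous lag column by one more week and drops the introduced nulls
        crimeWeek = crimeWeek.select("*", lag(prev).over(w).alias("count_lag{}".format(i))).na.drop()
        prev = "count_lag{}".format(i)
    return crimeWeek.withColumnRenamed("count", "label")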
Example no. 32
def runOtherFunctions(spark, personDf):
    df = spark.createDataFrame([("v1", "v2", "v3")], ["c1", "c2", "c3"]);

    # array
    df.select(df.c1, df.c2, df.c3, array("c1", "c2", "c3").alias("newCol")).show(truncate=False)

    # desc, asc
    personDf.show()
    personDf.sort(functions.desc("age"), functions.asc("name")).show()

    # pyspark 2.1.0 does not support desc_nulls_first, desc_nulls_last, asc_nulls_first, asc_nulls_last

    # split, length (in pyspark a column can be referenced as df["col"] or df.col)
    df2 = spark.createDataFrame([("Splits str around pattern",)], ['value'])
    df2.select(df2.value, split(df2.value, " "), length(df2.value)).show(truncate=False)

    # rownum, rank
    f1 = StructField("date", StringType(), True)
    f2 = StructField("product", StringType(), True)
    f3 = StructField("amount", IntegerType(), True)
    schema = StructType([f1, f2, f3])

    p1 = ("2017-12-25 12:01:00", "note", 1000)
    p2 = ("2017-12-25 12:01:10", "pencil", 3500)
    p3 = ("2017-12-25 12:03:20", "pencil", 23000)
    p4 = ("2017-12-25 12:05:00", "note", 1500)
    p5 = ("2017-12-25 12:05:07", "note", 2000)
    p6 = ("2017-12-25 12:06:25", "note", 1000)
    p7 = ("2017-12-25 12:08:00", "pencil", 500)
    p8 = ("2017-12-25 12:09:45", "note", 30000)

    dd = spark.createDataFrame([p1, p2, p3, p4, p5, p6, p7, p8], schema)
    w1 = Window.partitionBy("product").orderBy("amount")
    w2 = Window.orderBy("amount")
    dd.select(dd.product, dd.amount, functions.row_number().over(w1).alias("rownum"),
              functions.rank().over(w2).alias("rank")).show()
Example no. 33
def collect_numeric_metric(metric, df, population):
    cdf = df.select(df[metric['src']])
    cdf = cdf.dropna(subset=metric['src'])
    cdf = cdf.select(cdf[metric['src']].cast('float').alias('bucket'))

    total_count = cdf.count()
    num_partitions = total_count // 500
    ws = Window.orderBy('bucket')
    cdf = cdf.select(
        cdf['bucket'],
        cume_dist().over(ws).alias('c'),
        row_number().over(ws).alias('i'))
    cdf = cdf.filter("i = 1 OR i %% %d = 0" % num_partitions)
    cdf = cdf.collect()

    # Collapse rows with duplicate buckets.
    collapsed_data = []
    prev = None
    for d in cdf:
        if not collapsed_data:
            collapsed_data.append(d)  # Always keep first record.
            continue
        if prev and prev['bucket'] == d['bucket']:
            collapsed_data.pop()
        collapsed_data.append(d)
        prev = d

    # Calculate `p` from `c`.
    data = []
    prev = None
    for i, d in enumerate(collapsed_data):
        p = d['c'] - prev['c'] if prev else d['c']
        data.append({
            'bucket': d['bucket'],
            'c': d['c'],
            'p': p,
        })
        prev = d
    """
    Example of what `data` looks like now::

        [{'bucket': 0.0,        'c': 0.00126056, 'p': 0.00126056},
         {'bucket': 3.0,        'c': 0.00372313, 'p': 0.00246256},
         {'bucket': 4.0,        'c': 0.00430616, 'p': 0.0005830290622683026},
         {'bucket': 6.13319683, 'c': 0.00599801, 'p': 0.00169184},
         {'bucket': 8.0,        'c': 0.08114486, 'p': 0.07514685},
         {'bucket': 8.23087882, 'c': 0.08197282, 'p': 0.00082795},
         ...]
    """
    # Push data to database.
    sql = ("INSERT INTO api_numericcollection "
           "(num_observations, population, metric_id, dataset_id) "
           "VALUES (%s, %s, %s, %s) "
           "RETURNING id")
    params = [total_count, population, metric['id'], dataset_id]
    if DEBUG_SQL:
        collection_id = 0
        print(sql, params)
    else:
        cursor.execute(sql, params)
        conn.commit()
        collection_id = cursor.fetchone()[0]

    for d in data:
        sql = ("INSERT INTO api_numericpoint "
               "(bucket, proportion, collection_id) "
               "VALUES (%s, %s, %s)")
        params = [d['bucket'], d['p'], collection_id]
        if DEBUG_SQL:
            print(sql, params)
        else:
            cursor.execute(sql, params)

    if not DEBUG_SQL:
        conn.commit()