Example #1
def fingerprint_cluster(df, input_cols):
    """
    Cluster a dataframe column based on the Fingerprint algorithm
    :param df: Dataframe to be processed
    :param input_cols: Columns to be processed
    :return: DataFrame with the computed clusters
    """
    # df = self.df
    input_cols = parse_columns(df, input_cols)

    for input_col in input_cols:
        output_col = name_col(input_col, FINGERPRINT_COL)
        # Instead of applying the fingerprint to the whole data set, we group by names
        df = (
            df.groupBy(input_col).count().select(
                'count', input_col).repartition(
                    1)  # Needed for optimization in a single machine
            .cache())
        # Calculate the fingerprint
        df = fingerprint(df, input_col)

        count_col = name_col(input_col, COUNT_COL)
        cluster_col = name_col(input_col, CLUSTER_COL)
        recommended_col = name_col(input_col, RECOMMENDED_COL)
        cluster_size_col = name_col(input_col, CLUSTER_SIZE_COL)

        df = df.groupby(output_col).agg(
            F.collect_set(input_col).alias(cluster_col),
            F.sum("count").alias(count_col),
            F.first(input_col).alias(recommended_col),
            F.size(F.collect_set(input_col)).alias(cluster_size_col)).select(
                cluster_size_col, cluster_col, count_col, recommended_col)
    return df
Example #2
def n_gram_fingerprint_cluster(df, input_cols, n_size=2):
    """
    Cluster a DataFrame column based on the N-Gram Fingerprint algorithm
    :param df: Dataframe to be processed
    :param input_cols: Columns to be processed
    :param n_size: n-gram size
    :return: DataFrame with the computed clusters
    """
    input_cols = parse_columns(df, input_cols)
    for input_col in input_cols:
        ngram_fingerprint_col = name_col(input_col, NGRAM_FINGERPRINT_COL)

        # Prepare a group so we do not need to apply the fingerprint to the whole data set
        df = (
            df.select(input_col).groupBy(input_col).count().select(
                'count', input_col).repartition(
                    1)  # Needed for optimization in a single machine
            .cache())

        df = n_gram_fingerprint(df, input_col, n_size)

        count_col = name_col(input_col, COUNT_COL)
        cluster_col = name_col(input_col, CLUSTER_COL)
        recommended_col = name_col(input_col, RECOMMENDED_COL)
        cluster_size_col = name_col(input_col, CLUSTER_SIZE_COL)

        df = df.groupby(ngram_fingerprint_col).agg(
            F.collect_set(input_col).alias(cluster_col),
            F.sum("count").alias(count_col),
            F.first(input_col).alias(recommended_col),
            F.size(F.collect_set(input_col)).alias(cluster_size_col)).select(
                cluster_size_col, cluster_col, count_col, recommended_col)

    return df
Example #3
    def nunique(self, df):
        """ Calculates number of unique values in a column over a window"""
        w = self.get_window(self.partition_by, self.order_by,
                            self.window_length)
        return df.withColumn(
            self.column_alias,
            psf.size(psf.collect_set(self.aggregation_column).over(w)))
Example #4
def tf_idf(df, n):
    # Extracting terms per each row/document as a list
    temp_df = df.withColumn(
        'terms',
        f.split(f.lower(f.regexp_replace(df.text_entry, '[^\\w\\s-]', '')),
                ' '))

    # Calculating total number of words per row/document
    temp_df1 = temp_df.withColumn('total_num_words', f.size('terms'))

    # Extracting words in each documents
    temp_df2 = temp_df1.withColumn('token', f.explode('terms'))

    # Calculating tf
    temp_df3 = temp_df2.groupBy('_id', 'token', 'total_num_words').agg({
        'token':
        'count'
    }).withColumnRenamed('count(token)', 'occurrence').sort('_id')
    temp_df4 = temp_df3.withColumn('tf', temp_df3.occurrence)

    # Calculating df
    temp_df5 = temp_df4.groupBy('token').agg(
        f.countDistinct('_id')).withColumnRenamed('count(DISTINCT _id)', 'df')

    # Calculating idf
    temp_df6 = temp_df5.withColumn('idf', f.log10(n / temp_df5.df))

    # Calculating tf-idf
    joined_df = temp_df4.join(temp_df6,
                              temp_df4.token == temp_df6.token).select(
                                  temp_df4.token, temp_df4._id, temp_df4.tf,
                                  temp_df6.df, temp_df6.idf)
    result = joined_df.withColumn('tf_idf', joined_df.tf * joined_df.idf)

    return result
Example #5
def frequent_itemsets(filename, n, s, c):
    '''
    Using the FP-Growth algorithm from the ML library (see 
    http://spark.apache.org/docs/latest/ml-frequent-pattern-mining.html), 
    write a function that returns the first <n> frequent itemsets 
    obtained using min support <s> and min confidence <c> (parameters 
    of the FP-Growth model), sorted by (1) descending itemset size, and 
    (2) descending frequency. The FP-Growth model should be applied to 
    the DataFrame computed in the previous task. 
    
    Return value: a CSV string. As before, using toCSVLine may help.
    Test: tests/test_frequent_items.py
    '''
    spark = init_spark()
    result = spark.sparkContext.textFile(filename).map(lambda l: l.split(",")).zipWithIndex().map(lambda x: (x[1], x[0][0], x[0][1:]))
    df = spark.createDataFrame(result, ['id', 'plant','items'])

    fpGrowth = FPGrowth(itemsCol="items", minSupport=s, minConfidence=c)
    model = fpGrowth.fit(df)
    result=model.freqItemsets


    result=result.select("items","freq",size("items").alias("tam"))
    result=result.sort(desc('tam'),desc('freq')).limit(n)
    result=result.select('items','freq')

    return toCSVLine(result)
Example #6
def n_gram_fingerprint_cluster(df, columns, n_size=2):
    """
    Cluster a DataFrame column based on the N-Gram Fingerprint algorithm
    :param df: DataFrame to be processed
    :param columns: Columns to be processed
    :param n_size: n-gram size
    :return: DataFrame with the computed clusters
    """
    columns = parse_columns(df, columns)
    for col_name in columns:
        n_gram_col = col_name + "_ngram_fingerprint"

        # Prepare a group so we don't need to apply the fingerprint to the whole data set
        df = (df.select(col_name)
              .groupBy(col_name)
              .count()
              .select('count', col_name)
              .repartition(1)  # Needed for optimization in a single machine
              .cache())

        df = n_gram_fingerprint(df, col_name, n_size)
        # df.table()
        df = df.groupby(n_gram_col).agg(
            F.collect_set(col_name).alias("cluster"),
            F.sum("count").alias("count"),
            F.first(col_name).alias("recommended"),
            F.size(F.collect_set(col_name)).alias("cluster_size")
        ).select("cluster_size", "cluster", "count", "recommended")

    return df
Example #7
def fingerprint_cluster(df, columns):
    """
    Cluster a dataframe column based on the Fingerprint algorithm
    :param df: Dataframe to be processed
    :param columns: Columns to be processed
    :return: DataFrame with the computed clusters
    """
    # df = self.df
    columns = parse_columns(df, columns)

    for col_name in columns:
        output_col = col_name + "_FINGERPRINT"
        # Instead of applying the fingerprint to the whole data set, we group by names
        df = (df
              .groupBy(col_name)
              .count()
              .select('count', col_name)
              .repartition(1)  # Needed for optimization in a single machine
              .cache()
              )
        # Calculate the fingerprint
        df = fingerprint(df, col_name)

        # Create cluster
        df = df.groupby(output_col).agg(
            F.collect_set(col_name).alias("cluster"),
            F.sum("count").alias("count"),
            F.first(col_name).alias("recommended"),
            F.size(F.collect_set(col_name)).alias("cluster_size")
        ) \
            .select("cluster_size", "cluster", "count", "recommended")
    return df
Example #8
    def preprocess(self, data):
        data = data.withColumn(
            "_c0", functions.expr("substring(_c0, 2, length(_c0)-1)"))
        data = data.withColumn(
            "_c3", functions.expr("substring(_c3, 1, length(_c3)-1)"))
        data = data.withColumnRenamed("_c0", "form_id") \
            .withColumnRenamed("_c1", "views") \
            .withColumnRenamed("_c2", "submissions") \
            .withColumnRenamed("_c3", "features")

        data = data.select('form_id', 'views', 'submissions',
                           functions.split('features', '-').alias('features'))
        df_sizes = data.select(functions.size('features').alias('features'))
        df_max = df_sizes.agg(functions.max('features'))
        nb_columns = df_max.collect()[0][0]
        data = data.select('form_id', 'views', 'submissions',
                           *[data['features'][i] for i in range(nb_columns)])

        data = data.select(*(functions.col(column).cast("float").alias(column)
                             for column in data.columns))
        data = data.withColumn('form_id', functions.col('form_id').cast('int'))
        data = data.withColumn('views', functions.col('views').cast('int'))
        data = data.withColumn('submissions',
                               functions.col('submissions').cast('int'))
        data = data.withColumn(
            "submission_ratio",
            functions.col("submissions") / functions.col("views"))
        return data
Example #9
def amenities_rating(spark, amenities_pref, newh_df):
    pa_df = pd.DataFrame(amenities_pref, columns=["amenities_pref"])

    a_df = spark.createDataFrame(pa_df)
    a_df.createOrReplaceTempView('a_df')

    newh_df.createOrReplaceTempView('del_dup')
    newa_df = spark.sql(
        "SELECT * FROM newh_df INNER JOIN a_df WHERE newh_df.amenities=a_df.amenities_pref"
    )

    ameni_comb = newa_df.groupBy(functions.col("id")).agg(
        functions.collect_list(functions.col("amenities")).alias("amenities"))

    amenities_len = ameni_comb.withColumn(
        "ameni_len", functions.size(ameni_comb["amenities"])).orderBy(
            functions.col("ameni_len"), ascending=False)
    amenities_len.createOrReplaceTempView("amenities_len")

    ameni_df = spark.sql(
        "SELECT a.id,h.amenities,a.ameni_len FROM del_dup h INNER JOIN amenities_len a WHERE h.id=a.id ORDER BY a.ameni_len DESC"
    )

    find_rating = functions.udf(lambda a: get_rating(a), types.IntegerType())
    usr_rating = ameni_df.withColumn("rating", find_rating("ameni_len"))
    return usr_rating
Example #10
def sort_by_comment_length(epoch_df, batch_size=16):
    """
    TEST FUNCTION:
    Takes in a Spark dataframe
    Returns: A list of token, label tuples ordered
    by token sequence length
    """

    # add sequence lengths
    epoch_df = epoch_df.withColumn("sequence_length", F.size(epoch_df.tokens))

    # order by sequence length
    epoch_df = epoch_df.orderBy("sequence_length", ascending=False)

    # drop sequence length column
    epoch_df = epoch_df.drop("sequence_length")

    # convert pandas
    epoch_df = epoch_df.toPandas()

    # convert to sorted list of tuples
    sorted_tokens = [(epoch_df['tokens'].iloc[i], epoch_df['label'].iloc[i])
                     for i in range(len(epoch_df))]

    return sorted_tokens
Example #11
def preparar_df(df):
    df = df.repartition(df.user.id)

    df = df.where(F.length(df.text) > 0)
    df = df.select(
        "*",
        u_parse_time(
            df['created_at']).cast('timestamp').alias('created_at_ts'))

    df_intertweet = df.select(
        df.user.id.alias("user_id"),
        (df.created_at_ts.cast('bigint') -
         F.lag(df.created_at_ts.cast('bigint'), ).over(
             Window.partitionBy("user.id").orderBy("created_at_ts"))
         ).cast("bigint").alias("time_intertweet"))

    df_list_intertweet = df_intertweet.groupby(df_intertweet.user_id).agg(
        F.collect_list("time_intertweet").alias("lista_intertweet"))

    df_list_intertweet = df_list_intertweet.filter(
        F.size(df_list_intertweet.lista_intertweet) > 3)

    df = df.join(df_list_intertweet,
                 df["user.id"] == df_list_intertweet["user_id"])

    return df
Example #12
def process():

    data_content = [x.strip().split(',') for x in open(FILE_PATH).readlines()]
    data_content_tuple = []
    for i in range(0, len(data_content)):
        data_content_tuple.append((i, data_content[i]))

    df = spark.createDataFrame(data_content_tuple, ["id", "items"])

    fpGrowth = FPGrowth(itemsCol="items", minSupport=0.1, minConfidence=0.5)
    model = fpGrowth.fit(df)

    # Display frequent itemsets.
    # model.freqItemsets

    model.freqItemsets.filter(size('items') > 0).orderBy('freq',
                                                         ascending=0).show(
                                                             50, False)

    print(type(model.freqItemsets))

    # Display generated association rules.
    model.associationRules.orderBy('confidence', ascending=0).show(200, False)

    # transform examines the input items against all the association rules and summarizes the
    # consequents as predictions
    model.transform(df).show(50, False)
Example #13
def interests(filename, n, s, c):
    '''
    Using the same FP-Growth algorithm, write a script that computes 
    the interest of association rules (interest = |confidence - 
    frequency(consequent)|; note the absolute value)  obtained using 
    min support <s> and min confidence <c> (parameters of the FP-Growth 
    model), and prints the first <n> rules sorted by (1) descending 
    antecedent size in association rule, and (2) descending interest.

    Return value: a CSV string.
    Test: tests/test_interests.py
    '''
    spark = init_spark()
    result = spark.sparkContext.textFile(filename).map(lambda l: l.split(",")).zipWithIndex().map(
        lambda x: (x[1], x[0][0], x[0][1:]))
    df = spark.createDataFrame(result, ['id', 'plant', 'items'])

    fpGrowth = FPGrowth(itemsCol="items", minSupport=s, minConfidence=c)
    model = fpGrowth.fit(df)
    result = model.associationRules
    modelResult = model.freqItemsets
    result = modelResult.join(result, modelResult['items'] == result["consequent"])
    total = df.count()

    result = result.withColumn("interest", abs(result["confidence"] - result["freq"] / total))
    result = result.select(size("antecedent").alias('tam'), 'antecedent', 'consequent', 'confidence', "items", "freq", "interest")
    result = result.sort(desc('tam'), desc('interest')).limit(n)
    result = result.select('antecedent', 'consequent', 'confidence', "items", "freq", "interest")

    return toCSVLine(result)
Example #14
def generate_TFIDF(sc, df, sqlcontext):

    # 1. Count the number of rows (documents) in the DataFrame
    t_num = df.count()

    # 2. Select _id, lowercase the text_entry, remove punctuation symbols,
    # and split it into a list of words ('tokens')
    word_spilits = df.select(
        "_id",
        F.split(F.lower(F.regexp_replace(df.text_entry, r'[^\w\s]', '')), ' ').alias('tokens'))

    # 3. Explode the list of words to generate one (_id, token) row per word,
    # then group by _id and token to calculate the term frequency (tf) in each row,
    # producing a DataFrame words_tf (_id, token, tf)
    words_tf = word_spilits.select("_id", F.explode(word_spilits.tokens).alias('token')) \
        .groupBy("_id", "token").agg({'token': 'count'}).withColumnRenamed("count(token)", "tf")

    # 4. To calculate the document frequency (df) of each token, aggregate by token
    # and collect the set of _ids (duplicates eliminated by 'collect_set');
    # the size of that set is the document frequency,
    # producing a DataFrame words_df (_id, token, df)
    words_df = words_tf.groupby("token").agg(F.collect_set("_id").alias("_ids")) \
        .select("token", F.explode("_ids").alias('_id'), F.size("_ids").alias('df'))

    # 5. To build the final TF-IDF DataFrame, join words_tf and words_df on matching
    # _id and token, compute idf as log10 of the number of documents (t_num) divided
    # by the document frequency (df), and tf_idf as tf * idf
    tokensWithTfIdf = words_tf.join(words_df, (words_tf._id == words_df._id) & (words_tf.token == words_df.token)) \
        .select(words_tf._id, words_tf.token, words_tf.tf, words_df.df,
                (F.log10(t_num / words_df.df)).alias("idf"),
                (F.log10(t_num / words_df.df) * words_tf.tf).alias("tf_idf"))

    # 6. Cache the TF-IDF DataFrame for further use
    tokensWithTfIdf.cache()
    return tokensWithTfIdf
Example #15
def interests(filename, n, s, c):
    '''
    Using the same FP-Growth algorithm, write a script that computes 
    the interest of association rules (interest = |confidence - 
    frequency(consequent)|; note the absolute value)  obtained using 
    min support <s> and min confidence <c> (parameters of the FP-Growth 
    model), and prints the first <n> rules sorted by (1) descending 
    antecedent size in association rule, and (2) descending interest.

    Return value: a CSV string.
    Test: tests/test_interests.py
    '''
    spark = init_spark()
    lines = spark.read.text(filename).rdd
    parts = lines.map(lambda row: row.value.split(","))
    rdd_data = parts.map(lambda p: Row(name=p[0], items=p[1:]))
    df = spark.createDataFrame(rdd_data)
    total_count = df.count()
    fpGrowth = FPGrowth(itemsCol="items", minSupport=s, minConfidence=c)
    model = fpGrowth.fit(df)
    model_updated = model.associationRules.join(
        model.freqItemsets,
        model.associationRules['consequent'] == model.freqItemsets['items'])
    model_with_interest = model_updated.withColumn(
        "interest",
        lit(
            calculate_interest(model_updated.confidence, model_updated.freq,
                               total_count)))
    model_1 = model_with_interest.drop("lift")
    model_2 = model_1.orderBy([size("antecedent"), "interest"],
                              ascending=[0, 0])
    final_op = toCSVLine(model_2.limit(n))
    return final_op
Example #16
def save_table(df, table_name, partition_keys=None):
    print(f"Saving table: {table_name}")
    output_path = f"s3://{bucket_name}/{output_dir}/{table_name}"
    spark.sql(f"drop table if exists {database_name}.{table_name}")

    df = df.withColumn(
        'dataset_name',
        f.split(
            f.split(f.input_file_name(), '/').getItem(
                f.size(f.split(f.input_file_name(), '/')) - 1),
            r'\.').getItem(0))

    if partition_keys is not None:
        df\
            .repartition(*partition_keys)\
            .write\
            .mode("overwrite")\
            .format("parquet")\
            .partitionBy(*partition_keys)\
            .option("path", output_path)\
            .saveAsTable(f"{database_name}.{table_name}")
    else:
        df\
            .coalesce(1)\
            .write\
            .mode("overwrite")\
            .format("parquet")\
            .option("path", output_path)\
            .saveAsTable(f"{database_name}.{table_name}")
    print(f"Table: {table_name} saved")
Example #17
def text_features(p_df):
    """
    Extracts features derived from the quora question texts.
    :param p_df: A DataFrame.
    :return: A DataFrame.  
    """
    diff_len = udf(lambda arr: arr[0] - arr[1], IntegerType())
    common_words = udf(lambda arr: len(set(arr[0]).intersection(set(arr[1]))), IntegerType())
    unique_chars = udf(lambda s: len(''.join(set(s.replace(' ', '')))), IntegerType())


    p_df = p_df.withColumn("len_q1", length("question1")).withColumn("len_q2", length("question2"))
    p_df = p_df.withColumn("diff_len", diff_len(array("len_q1", "len_q2")))
    p_df = p_df.withColumn("words_q1", size("question1_words")).withColumn("words_q2", size("question2_words"))
    p_df = p_df.withColumn("common_words", common_words(array("question1_words", "question2_words")))
    p_df = p_df.withColumn(
        "unique_chars_q1", unique_chars("question1")
    ).withColumn("unique_chars_q2", unique_chars("question2"))

    assembler = VectorAssembler(
        inputCols=["len_q1", "len_q2", "diff_len", "words_q1", "words_q2", "common_words", "unique_chars_q1", "unique_chars_q2"],
        outputCol="text_features"
    )
    p_df = assembler.transform(p_df)    
    return p_df
Example #18
def add_user_roles(wmhist, remember_dict):

    def role_filter(rg, role_set):
        if rg is None:
            return False
        else:
            return any(role in role_set for role in rg)

    py_is_admin = lambda rg: role_filter(rg, {"bureaucrat","sysop","steward","arbcom"})

    py_is_bot = lambda rg: role_filter(rg, {"copyviobot","bot"})

    py_is_patroller = lambda rg: role_filter(rg, {"patroller"})

    udf_is_admin = f.udf(py_is_admin,BooleanType())
    udf_is_bot = f.udf(py_is_bot,BooleanType())
    udf_is_patroller = f.udf(py_is_patroller, BooleanType())

    wmhist = wmhist.withColumn("event_user_isadmin", udf_is_admin(wmhist.event_user_groups))
    wmhist = wmhist.withColumn("event_user_isbot1", udf_is_bot(wmhist.event_user_groups))
    wmhist = wmhist.withColumn("event_user_ispatroller", udf_is_patroller(wmhist.event_user_groups))
    wmhist = wmhist.withColumn("event_user_isbot2", f.size(wmhist.event_user_is_bot_by) > 0)

    wmhist = wmhist.withColumn("role_type", f.when(wmhist.event_user_isadmin, "admin").otherwise(
        f.when( (wmhist.event_user_isbot1) | (wmhist.event_user_isbot2),"bot").otherwise(
            f.when(wmhist.event_user_ispatroller, "patroller").otherwise("other")
        )))
    
    return (wmhist, remember_dict)
Example #19
def levenshtein_cluster(df, input_col):
    """
    Return a DataFrame with the cluster of strings related to each string
    :param df: DataFrame to be processed
    :param input_col: Column to be processed
    :return: DataFrame with the computed clusters
    """
    # Prepare a group so we don't need to apply the fingerprint to the whole data set
    df = df.select(input_col).groupby(input_col).agg(F.count(input_col).alias("count"))
    df = keycollision.fingerprint(df, input_col)

    count_col = name_col(input_col, COUNT_COL)
    cluster_col = name_col(input_col, CLUSTER_COL)
    recommended_col = name_col(input_col, RECOMMENDED_COL)
    cluster_size_col = name_col(input_col, CLUSTER_SIZE_COL)
    fingerprint_col = name_col(input_col, FINGERPRINT_COL)

    df_t = df.groupby(fingerprint_col).agg(F.collect_list(input_col).alias(cluster_col),
                                           F.size(F.collect_list(input_col)).alias(cluster_size_col),
                                           F.first(input_col).alias(recommended_col),
                                           F.sum("count").alias(count_col)).repartition(1)

    # Filter nearest string
    df_l = levenshtein_filter(df, input_col).repartition(1)

    # Create Cluster
    df_l = df_l.join(df_t, (df_l[input_col + "_FROM"] == df_t[fingerprint_col]), how="left") \
        .cols.drop(fingerprint_col) \
        .cols.drop([input_col + "_FROM", input_col + "_TO", input_col + "_LEVENSHTEIN_DISTANCE"])

    return df_l
Example #20
def get_basket_items(sdf,
                     item_col,
                     *key_cols,
                     include_duplicate_items=True,
                     exclude_single_item_baskets=True):
    """ generate sets of items from a table listing items individually (along with the group they belong to)
  
  Args:
    sdf: input Spark dataframe
    item_col: the name of the column indicating the item
    *key_cols: the names of the columns that collectively indicate the group the item should be placed in
    include_duplicate_items: if True, keep repeated items within a group (collect_list); otherwise de-duplicate them (collect_set)
    exclude_single_item_baskets: if True, baskets with only a single item will be filtered out
  
  Notes:
    This function is more or less equivalent to this SQL query:
    
     select patient, encounters.id encounter, encounter_date, 
         collect_list(distinct condition) items, 
         count(distinct condition) num_items
       from encounters 
       group by patient, encounter, encounter_date

  """
    collect_fun = fn.collect_list if include_duplicate_items else fn.collect_set
    basket_items = sdf \
      .groupby(*key_cols) \
      .agg(
        collect_fun(item_col).alias('items')
      )

    if exclude_single_item_baskets:
        basket_items = basket_items.filter(fn.size('items') > 1)

    return basket_items
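A minimal usage sketch (not from the source): assuming a Spark DataFrame named encounters with patient, encounter, encounter_date and condition columns that mirror the SQL in the docstring above, the call might look like this.

# Hypothetical usage; the DataFrame and column names are illustrative only.
baskets = get_basket_items(
    encounters,                                 # input Spark DataFrame, one item per row
    'condition',                                # item column
    'patient', 'encounter', 'encounter_date',   # key columns that define a basket
    include_duplicate_items=False,              # de-duplicate items, like collect_list(distinct condition)
    exclude_single_item_baskets=True)           # drop baskets containing a single item
baskets.show(5, truncate=False)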
Example #21
def frequent_itemsets(filename, n, s, c):
    '''
    Using the FP-Growth algorithm from the ML library (see 
    http://spark.apache.org/docs/latest/ml-frequent-pattern-mining.html), 
    write a function that returns the first <n> frequent itemsets 
    obtained using min support <s> and min confidence <c> (parameters 
    of the FP-Growth model), sorted by (1) descending itemset size, and 
    (2) descending frequency. The FP-Growth model should be applied to 
    the DataFrame computed in the previous task. 
    
    Return value: a CSV string. As before, using toCSVLine may help.
    Test: tests/test_frequent_items.py
    '''
    spark = init_spark()
    lines = spark.read.text(filename).rdd
    parts = lines.map(lambda row: row.value.split(","))
    rdd_data = parts.map(lambda p: Row(name=p[0], items=p[1:]))
    df = spark.createDataFrame(rdd_data)
    fpGrowth = FPGrowth(itemsCol="items", minSupport=s, minConfidence=c)
    model = fpGrowth.fit(df)
    model_1 = model.freqItemsets.orderBy([size("items"), "freq"],
                                         ascending=[0, 0])
    final_op = toCSVLine(model_1.limit(n))
    return final_op
Example #22
def main(inputs):
    poi = spark.read.json(inputs, schema=amenity_schema)
    poi = poi.filter((poi['lon'] > -123.5) & (poi['lon'] < -122))
    poi = poi.filter((poi['lat'] > 49) & (poi['lat'] < 49.5))
    #poi = poi.coalesce(1) # ~1MB after the filtering

    transportations_data = poi.filter(poi.amenity.isin(transportations))
    schools_data = poi.filter(poi.amenity.isin(schools))
    bike_parking_data = transportations_data.filter(
        (transportations_data['amenity'] == 'bicycle_parking')
        & (functions.size('tags') > 0))
    fuel_data = transportations_data.filter(
        transportations_data['amenity'] == 'fuel')

    transportations_data.write.json('../transportations-Vancouver',
                                    mode='overwrite',
                                    compression='gzip')
    schools_data.write.json('../schools-Vancouver',
                            mode='overwrite',
                            compression='gzip')
    bike_parking_data.write.json('../bikes-Vancouver',
                                 mode='overwrite',
                                 compression='gzip')
    fuel_data.write.json('../fuel-Vancouver',
                         mode='overwrite',
                         compression='gzip')
Example #23
def pyldavis_data_format(tokenized_df, count_vectorizer, transformed,
                         lda_model):
    word_counts = tokenized_df.select((explode(
        tokenized_df.documents)).alias("words")).groupby("words").count()
    word_counts_list = {r['words']: r['count'] for r in word_counts.collect()}
    word_counts_list = [
        word_counts_list[w] for w in count_vectorizer.vocabulary
    ]

    #Create data with key-value pairs as expected by the pyLDAvis tool
    data = {
        'topic_term_dists':
        np.array(lda_model.topicsMatrix().toArray()).T,
        'doc_topic_dists':
        np.array([
            x.toArray() for x in transformed.select(
                ["topicDistribution"]).toPandas()['topicDistribution']
        ]),
        'doc_lengths': [
            r[0] for r in tokenized_df.select(size(
                tokenized_df.documents)).collect()
        ],
        'vocab':
        count_vectorizer.vocabulary,
        'term_frequency':
        word_counts_list
    }
    return data
Example #24
def count_categories(df):
    """
        [[Category:Category name]]
        [[:Category:Category name]]
        [[:File:File name]]
    """
    pattern = r'\[\[:?Category:[a-zA-Z0-9.,\-!?\(\) ]+\]\]'
    return df.withColumn('n_categories', size(split(col('text'), pattern)) - 1)
Example #25
def count_unreferenced(df):
    """
        <ref>Lots of words</ref> -- reference without a link
        {{cn}} -- citation needed
    """
    pattern = r'\{\{cn\}\}|<ref>[a-zA-Z0-9.,!? ]+</ref>'
    return df.withColumn('n_unreferenced',
                         size(split(col('text'), pattern)) - 1)
Example #26
def count_of_images(df):
    """
        [[File: | thumb  | upright | right | alt= | caption ]]
    """
    any_text = r"[a-zA-Z0-9.,!? ]+ \] "
    pattern = r"\[[a-zA-Z0-9.,!? ]+\|[a-zA-Z0-9.,!? ]+\|[a-zA-Z0-9.,!? ]+\|[a-zA-Z0-9.,!? ]+\|[a-zA-Z0-9.,!? ]+\|[a-zA-Z0-9.,!? ]+\|[a-zA-Z0-9.,!? ]+\|[a-zA-Z0-9.,!? ]+\]"
    return df.withColumn("n_images",
                         size(split(col('text'), pattern=pattern)) - 1)
Example #27
def count_items(df, parent_feature, column):
    
    name = parent_feature.get_output_columns()[0]
    
    #df = df.withColumn(name, F.lit(10))
    df = df.withColumn(name, F.size(F.split(F.col(column), r"name")) - 1)
    
    return df
Example #28
def similaryBasedOnFollowers(data, minFollowers=20, debug=False):

    # We start by renaming the user column in line with the notation
    # above.
    data = data.withColumnRenamed('follows', 'u1')

    # ==== Step 1 ====
    u1_fu1 = data.groupBy('u1').agg(F.collect_set(
        data.user).alias('fu1')).filter(F.size('fu1') >= minFollowers)

    if (debug):
        print('>> Step 1 :: u1 f(u1) <<')
        u1_fu1.show()

    # ==== Step 2 ====
    # First create a "dual" of data by renaming columns.
    # This will help the subsequent join.
    u2_fu2 = u1_fu1.withColumnRenamed('u1',
                                      'u2').withColumnRenamed('fu1', 'fu2')

    prod = u1_fu1.crossJoin(u2_fu2).filter(u1_fu1.u1 < u2_fu2.u2)

    if (debug):
        print('>> Step 2 :: u1 f(u1) u2 f(u2) <<')
        prod.show()

    # ==== Step 3 ====
    prod2 = prod.withColumn('I',
                            F.array_intersect(prod.fu1, prod.fu2)).withColumn(
                                'U',
                                F.array_union(prod.fu1,
                                              prod.fu2)).drop('fu1', 'fu2')

    if (debug):
        print('>> Step 3 :: u1 u2 I(u1,u2) U(u1,u2) <<')
        #prod2.orderBy('I',ascending=False).show()
        prod2.show()

    # ==== Step 4 ====
    result = prod2.withColumn('JI', F.size('I') / F.size('U')).drop('I', 'U')

    if (debug):
        print('>> Step 4 :: u1 u2 J(u1,u2) <<')
        result.show()
    return result
Example #29
def sdf_pooling_sequence(sdf, col=None, length=None, mode='mean'):
    if col is None:
        col = sdf.columns[0]
    if length is None:
        length = sdf.select(F.size(col).alias('length')).take(1)[0]['length']
    sdf = sdf.select([F.col(col)[i].alias(f'temp_{i}') for i in range(length)])
    sdf = eval(f"sdf.groupby().{mode}()")
    sdf = sdf.select(F.array(sdf.columns).alias(col))
    return sdf
Example #30
    def test_slice(self):
        from pyspark.sql.functions import lit, size, slice

        df = self.spark.createDataFrame([([1, 2, 3], ), ([4, 5], )], ['x'])

        self.assertEqual(
            df.select(slice(df.x, 2, 2).alias("sliced")).collect(),
            df.select(slice(df.x, lit(2), lit(2)).alias("sliced")).collect(),
        )

        self.assertEqual(
            df.select(slice(df.x,
                            size(df.x) - 1, lit(1)).alias("sliced")).collect(),
            [Row(sliced=[2]), Row(sliced=[4])])
        self.assertEqual(
            df.select(slice(df.x, lit(1),
                            size(df.x) - 1).alias("sliced")).collect(),
            [Row(sliced=[1, 2]), Row(sliced=[4])])
Example #31
def wordCount(df, colName):
    """
    Args:
        df: a DataFrame
        colName: a column for counting the number of words in it
    Returns:
        df: a DataFrame with one more column word_count of colName 
    """
    return df.withColumn('word_count', f.size(f.split(f.col(colName), ' ')))
Example #32
# COMMAND ----------

from pyspark.sql.functions import split
df.select(split(col("Description"), " ")).show(2)


# COMMAND ----------

df.select(split(col("Description"), " ").alias("array_col"))\
  .selectExpr("array_col[0]").show(2)


# COMMAND ----------

from pyspark.sql.functions import size
df.select(size(split(col("Description"), " "))).show(2) # shows 5 and 3


# COMMAND ----------

from pyspark.sql.functions import array_contains
df.select(array_contains(split(col("Description"), " "), "WHITE")).show(2)


# COMMAND ----------

from pyspark.sql.functions import split, explode

df.withColumn("splitted", split(col("Description"), " "))\
  .withColumn("exploded", explode(col("splitted")))\
  .select("Description", "InvoiceNo", "exploded").show(2)
Example #33
# MAGIC %md
# MAGIC Use our `removeWords` function that we registered in wiki-eda to clean up stop words.

# COMMAND ----------

sqlContext.sql("drop table if exists words")
words.registerTempTable("words")

# COMMAND ----------

noStopWords = sqlContext.sql("select removeWords(words) as words from words")  # .cache()
display(noStopWords)

# COMMAND ----------

wordVecInput = noStopWords.filter(func.size("words") != 0)
wordVecInput.count()

# COMMAND ----------

# MAGIC %md
# MAGIC Build the `Word2Vec` model.  This takes about a minute with two workers.

# COMMAND ----------

from pyspark.ml.feature import Word2Vec

word2Vec = Word2Vec(vectorSize=150, minCount=50, inputCol="words", outputCol="result", seed=0)
model = word2Vec.fit(wordVecInput)

# COMMAND ----------
Example #34
# MAGIC %md
# MAGIC Use our `removeWords` function that we registered in wiki-eda to clean up stop words.

# COMMAND ----------

sqlContext.sql('drop table if exists words')
words.registerTempTable('words')

# COMMAND ----------

noStopWords = sqlContext.sql('select removeWords(words) as words from words') #.cache()
display(noStopWords)

# COMMAND ----------

wordVecInput = noStopWords.filter(func.size('words') != 0)
wordVecInput.count()

# COMMAND ----------

# MAGIC %md
# MAGIC Build the `Word2Vec` model.  This takes about a minute with two workers.

# COMMAND ----------

from pyspark.ml.feature import Word2Vec
word2Vec = Word2Vec(vectorSize=150, minCount=50, inputCol='words', outputCol='result', seed=0)
model = word2Vec.fit(wordVecInput)

# COMMAND ----------
Example #35
# MAGIC %md
# MAGIC Calculate the number of words in `noStopWords`.  Recall that each row contains an array of words.
# MAGIC  
# MAGIC One strategy would be to take the length of each row and sum the lengths.  To do this use `functions.size`, `functions.sum`, and call `.agg` on the `DataFrame`.
# MAGIC  
# MAGIC Don't forget to refer to the  [Python](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html) and [Scala](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.package) APIs.  For example you'll find detail for the function `size` in the [functions module](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.size) in Python and the [functions package](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.functions$) in Scala.

# COMMAND ----------

# MAGIC %md
# MAGIC First, create a `DataFrame` named sized that has a `size` column with the size of each array of words.  Here you can use `func.size`.

# COMMAND ----------

# ANSWER
sized = noStopWords.withColumn('size', func.size('words'))

sizedFirst = sized.select('size', 'words').first()
print(sizedFirst[0])

# COMMAND ----------

# TEST
from test_helper import Test
Test.assertEquals(sizedFirst[0], len(sizedFirst[1]), 'incorrect implementation for sized')

# COMMAND ----------

# MAGIC %md
# MAGIC Next, you'll need to aggregate the counts.  You can do this using `func.sum` in either a `.select` or `.agg` method call on the `DataFrame`.  Make sure to give your `Column` the alias `numberOfWords`.  There are some examples in [Python](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.GroupedData.agg) and [Scala](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.DataFrame) in the APIs.
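
# COMMAND ----------

# MAGIC %md
# MAGIC A sketch of one possible answer (not part of the original notebook): use `func.sum` inside an `.agg` call on the `sized` DataFrame from above, aliasing the result `numberOfWords`.

# COMMAND ----------

numberOfWordsDF = sized.agg(func.sum('size').alias('numberOfWords'))
numberOfWordsDF.show()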