Example #1
    def mediaUrls(df):
        animated_gif_urls = df.where(
            col("extended_entities").isNotNull()
            & col("extended_entities.media").isNotNull()
            & array_contains(col("extended_entities.media.type"), "animated_gif")
        ).select(
            explode(col("extended_entities.media.media_url_https")).alias(
                "animated_gif_url"))

        image_urls = df.where(
            col("entities.media").isNotNull()
            & array_contains(col("extended_entities.media.type"), "photo")
        ).select(
            explode(col("entities.media.media_url_https")).alias("image_url"))

        video_urls = (df.where(
            col("extended_entities").isNotNull()
            & col("extended_entities.media").isNotNull()
            & col("extended_entities.media.video_info").isNotNull()
            & array_contains(col("extended_entities.media.type"), "video")
        ).select(
            explode(col("extended_entities.media.video_info.variants")).alias(
                "video_info")).filter("video_info is not NULL").select(
                    explode(col("video_info"))).withColumn(
                        "video_url", col("col.url")).drop(col("col")))

        return image_urls.union(video_urls.union(animated_gif_urls))
Example #2
def test_group_status(spark: SparkSession, df_group_status: DataFrame) -> None:
    from pyspark.sql import functions as F
    from pyspark.sql.functions import col, lit, to_date, when

    df_group_status.show()
    df_group_status.printSchema()

    df_enrich: DataFrame = df_group_status \
                    .withColumn("cond1", when(col("dt") >= to_date(lit('2020-01-01'), 'yyyy-MM-dd'), lit(True)).otherwise(lit(False))) \
                    .withColumn("cond2", when(col("dt") >= to_date(lit('2021-01-01'), 'yyyy-MM-dd'), lit(True)).otherwise(lit(False)))

    df_enrich.show()
    df_enrich.printSchema()

    df_enrich_further: DataFrame = df_enrich.groupBy("grp") \
                        .agg(F.collect_set("cond1"), F.collect_set("cond2")).toDF(*["grp", "cond1_set", "cond2_set"])

    df_enrich_further.show()
    df_enrich_further.printSchema()

    df_final: DataFrame = df_enrich_further.withColumn("from_cond1_set", ~F.array_contains(F.col("cond1_set"), False)) \
                        .withColumn("from_cond2_set", ~F.array_contains(F.col("cond2_set"), False))

    df_final.show()
    df_final.printSchema()

    df_final: DataFrame = df_final.drop(*["cond1_set", "cond2_set"])
    df_enrich: DataFrame = df_enrich.drop(*["cond1", "cond2"])

    df_enrich.join(df_final, df_enrich["grp"] == df_final["grp"],
                   "inner").show()
Example #3
def test_array_contains(data_gen):
    arr_gen = ArrayGen(data_gen)
    lit = gen_scalar(data_gen, force_no_nulls=True)
    assert_gpu_and_cpu_are_equal_collect(lambda spark: two_col_df(
        spark, arr_gen, data_gen).select(array_contains(col('a'), lit.cast(data_gen.data_type)),
                                         array_contains(col('a'), col('b')),
                                         array_contains(col('a'), col('a')[5])), no_nans_conf)
Example #4
    def test_np_scalar_input(self):
        import numpy as np
        from pyspark.sql import Row
        from pyspark.sql.functions import array_contains, array_position, lit

        df = self.spark.createDataFrame([([1, 2, 3], ), ([], )], ["data"])
        for dtype in [np.int8, np.int16, np.int32, np.int64]:
            self.assertEqual(df.select(lit(dtype(1))).dtypes, [("1", "int")])
            res = df.select(array_contains(df.data,
                                           dtype(1)).alias("b")).collect()
            self.assertEqual([Row(b=True), Row(b=False)], res)
            res = df.select(array_position(df.data,
                                           dtype(1)).alias("c")).collect()
            self.assertEqual([Row(c=1), Row(c=0)], res)

        # java.lang.Integer max: 2147483647
        max_int = 2147483647
        # Convert int to bigint automatically
        self.assertEqual(
            df.select(lit(np.int32(max_int))).dtypes, [("2147483647", "int")])
        self.assertEqual(
            df.select(lit(np.int64(max_int + 1))).dtypes,
            [("2147483648", "bigint")])

        df = self.spark.createDataFrame([([1.0, 2.0, 3.0], ), ([], )],
                                        ["data"])
        for dtype in [np.float32, np.float64]:
            self.assertEqual(
                df.select(lit(dtype(1))).dtypes, [("1.0", "double")])
            res = df.select(array_contains(df.data,
                                           dtype(1)).alias("b")).collect()
            self.assertEqual([Row(b=True), Row(b=False)], res)
            res = df.select(array_position(df.data,
                                           dtype(1)).alias("c")).collect()
            self.assertEqual([Row(c=1), Row(c=0)], res)
Example #5
	def selectGenreWord(self, genreIds):
		result = None
		for gId in genreIds:
			res = self.curDataWord.filter(functions.array_contains(self.curDataWord.genre, gId)).select('word', 'priority', 'searchApp', 'searchCount', 'genre').distinct()
			if result is None:
				result = res
			else:
				result = result.unionAll(res)
		return result
Example #6
def main():
    stagemetrics = StageMetrics(spark)
    stagemetrics.begin()

    titles_df = read_csv('./data/titles.tsv')
    ratings_df = read_csv('./data/ratings.tsv')
    principals_df = read_csv('./data/principals.tsv')
    names_df = read_csv('./data/names.tsv')
    episodes_df = read_csv('./data/episodes.tsv')
    crew_df = read_csv('./data/crew.tsv')
    akas_df = read_csv('./data/akas.tsv')

    titles_df = titles_df.filter(f.array_contains(f.split(f.col('genres'), ','), 'Comedy') & (f.col('titleType') != 'short'))

    ratings_df = ratings_df.filter(f.col('averageRating') > 5.0)
    ratings_df = ratings_df.filter(f.col('numVotes').cast('int') > 8376)

    principals_df = principals_df.filter((f.col('category') != 'self') & (f.col('category') != 'cinematographer'))
    principals_df = principals_df.orderBy('tconst')
    principals_df = principals_df.filter(f.col('ordering').cast('int') <= 3)

    names_df = names_df.filter(f.col('deathYear').isNotNull())
    names_df = names_df.filter(f.array_contains(f.split(f.col('primaryProfession'), ','), 'miscellaneous'))

    akas_df = akas_df.filter(f.col('isOriginalTitle') == '1')
    akas_df = akas_df.filter(f.col('region') == 'US')

    full_movie_df = titles_df.join(akas_df, f.col('tconst') == f.col('titleId'), 'full').join(ratings_df, on=['tconst']).join(episodes_df, on=['tconst'])

    titles_grp_df = titles_df.withColumn('genres', f.explode(f.split(f.col('genres'), ','))).groupBy('genres').agg(f.count('tconst'))

    episode_mov_df = episodes_df.filter(f.col('episodeNumber').cast('int') > 10).join(titles_df, on=['tconst'])

    print(titles_df.count())
    print(ratings_df.count())
    print(principals_df.count())
    print(names_df.count())
    print(episodes_df.count())
    print(crew_df.count())
    print(akas_df.count())
    print(full_movie_df.count())
    print(episode_mov_df.count())
    print(titles_grp_df.count())

    titles_df.show(100)
    ratings_df.show(100)
    principals_df.show(100)
    names_df.show(100)
    episodes_df.show(100)
    crew_df.show(100)
    akas_df.show(100)
    full_movie_df.show(100)
    episode_mov_df.show(100)
    titles_grp_df.show(100)

    stagemetrics.end()
    stagemetrics.print_report()
Example #7
	def selectAppIdWord(self, appIds):
		result = None
		for appId in appIds:
			res = self.curDataWord.filter(functions.array_contains(self.curDataWord.searchapp, appId)).select('word', 'priority', 'searchApp', 'searchCount', 'genre').distinct()
			if result is None:
				result = res
			else:
				result = result.unionAll(res)
		word = result.select('word')
		result = result.dropna(how='any')
		return result, word
Example #8
def add_revert_types(wmhist, comment_column='event_comment'):
    wmhist = wmhist.withColumn("revert_tools_match",match_comment(f.col(comment_column),f.col("wiki_db"),f.col("event_timestamp")))
    wmhist = wmhist.withColumn("is_undo", f.array_contains(col='revert_tools_match',value='undo'))
    wmhist = wmhist.withColumn("is_rollback", f.array_contains(col='revert_tools_match',value='rollback'))

    tool_priority = ['huggle','twinkle','fastbuttons','LiveRC','rollback','undo']
    tool_column_names = ["revert_tool_{0}".format(tool) for tool in tool_priority]
    for tool, tool_column_name in zip(tool_priority, tool_column_names):
        wmhist = wmhist.withColumn(tool_column_name, f.when(f.array_contains(f.col("revert_tools_match"),tool),tool).otherwise(None))

        wmhist = wmhist.withColumn("revert_tool",f.coalesce(*tool_column_names))
        wmhist = wmhist.fillna('otherTool',subset=['revert_tool'])

        return wmhist
Example #9
def random_text_classifier(input_loc, output_loc):
    """
    This is a dummy function that mocks the following steps:

        1. clean input data (tokenization, remove stop words)
        2. use a pre-trained model to make prediction 
        3. write predictions to a HDFS output

    Naively marks reviews having the text "good" as positive and
    the rest as negative 
    """

    # read input
    df_raw = spark.read.option("header", True).csv(input_loc)
    # perform text cleaning

    # Tokenize text
    tokenizer = Tokenizer(inputCol='review_str', outputCol='review_token')
    df_tokens = tokenizer.transform(df_raw).select('cid', 'review_token')

    # Remove stop words
    remover = StopWordsRemover(inputCol='review_token',
                               outputCol='review_clean')
    df_clean = remover.transform(df_tokens).select('cid', 'review_clean')

    # check for the presence of "good" and naively assume it's a positive review
    df_out = df_clean.select(
        'cid',
        array_contains(df_clean.review_clean, "good").alias('positive_review'))

    df_out.write.mode("overwrite").parquet(output_loc)
Example #10
def random_text_classifier(input_loc, output_loc):
    """
    This is a dummy function to show how to use spark.
    It is supposed to mock the following steps:
        1. clean input data
        2. use a pre-trained model to make a prediction 
        3. write predictions to a HDFS output

    Since this is meant as an example, we are going to skip building a model,
    instead we are naively going to mark reviews having the text "good" as positive and
    the rest as negative 
    """

    # read input
    df_raw = spark.read.option("header", True).csv(input_loc)

    # Perform text cleaning
    # tokenize text
    tokenizer = Tokenizer(inputCol="review_str", outputCol="review_token")
    df_tokens = tokenizer.transform(df_raw).select("cid", "review_token")

    # remove stop words
    remover = StopWordsRemover(inputCol="review_token",
                               outputCol="review_clean")
    df_clean = remover.transform(df_tokens).select("cid", "review_clean")

    # now check presence of "good" and naively classify as positive review
    df_out = df_clean.select(
        "cid",
        array_contains(df_clean.review_clean, "good").alias("positive_review"))
    df_out.write.mode("overwrite").parquet(output_loc)
Example #11
def outputTopRated(metadata, reviews, category):
    '''
    Input: the metadata and reviews collections, and a category name
    Output: the top-rated product in that category
    '''

    cate_filtered = metadata.filter(
        func.array_contains(metadata["categories"], category))

    #join cate_filtered dataframe with reviews collection
    #and select id, title and overall columns
    inner_join = cate_filtered.join(reviews,
                                    cate_filtered.asin == reviews.asin).select(
                                        cate_filtered['asin'], 'title',
                                        'overall')

    #map each row to a list
    map_join = inner_join.rdd.map(list)

    #map: (id, title) is the key, (rating, 1) is the value
    #reduce: sum the ratings and the 1s (the count of reviews)
    counts = map_join.map(lambda x: ((x[0], x[1]), (x[2], 1))).reduceByKey(
        lambda a, b: (a[0] + b[0], a[1] + b[1])).sortBy(lambda x: x[1])
    counts = counts.map(
        lambda x: [category, x[0][1], x[1][0], x[1][1]]).sortBy(
            lambda x: float(x[2]) / float(x[3]), ascending=False)
    countsDF = counts.toDF()

    #get the products with num of reviews > 100
    countsDF_filtered = countsDF.filter(countsDF[3] > 100)

    return countsDF_filtered.limit(1)
Example #12
    def test_array_contains_function(self):
        from pyspark.sql.functions import array_contains

        df = self.spark.createDataFrame([(["1", "2", "3"], ), ([], )],
                                        ['data'])
        actual = df.select(array_contains(df.data, "1").alias('b')).collect()
        self.assertEqual([Row(b=True), Row(b=False)], actual)
Example #13
def random_text_classifier(input_loc, output_loc):
    """
    This is a dummy function to show how to use Spark. It is supposed to mock
    the following steps:
        1. clean input data
        2. use a pre-trained model to make prediction 
        3. write predictions to a HDFS output

    Since this is meant as an example, we are going to skip building a model,
    instead we are naively going to mark reviews having the text "good" as positive and
    the rest as negative 
    """

    # read input
    df_raw = spark.read.option("header", True).csv(input_loc)
    # perform text cleaning

    # Tokenize text
    tokenizer = Tokenizer(inputCol='review_str', outputCol='review_token')
    df_tokens = tokenizer.transform(df_raw).select('cid', 'review_token')

    # Remove stop words
    remover = StopWordsRemover(inputCol='review_token',
                               outputCol='review_clean')
    df_clean = remover.transform(df_tokens).select('cid', 'review_clean')

    # function to check presence of good
    df_out = df_clean.select(
        'cid',
        array_contains(df_clean.review_clean, "good").alias('positive_review'))
    # parquet is a popular column storage format, we use it here
    df_out.write.mode("overwrite").parquet(output_loc)
Example #14
def main():
    Spark = get_Spark_Session("Json_reabd")
    S = SparkSession.builder.getOrCreate()
    schema = StructType([StructField("name", StringType()), StructField("age", IntegerType()), StructField("cars", StructType(
        [StructField("car1", StringType()), StructField("car2", StringType()), StructField("car3", StringType())]))])
    Json_Df = Spark.read.option("multiline", "true").schema(schema).json("d:/Nested_Json.json")
    Json_Df1 = Json_Df
    Json_Df1.show()
    for col_name in Json_Df.columns:
        print(col_name)

    # flatten the nested "cars" struct into top-level columns
    for i in Json_Df.select("cars.*").columns:
        c_name = "cars." + i
        Json_Df = Json_Df.withColumn(i, F.col(c_name))

    structureSchema = StructType().add("id", StringType()).add("dept", StringType()).add("properties", StructType().add("salary", IntegerType()).add("location", StringType()))
    print(structureSchema)
    Json_Df6 = Spark.read.option("multiline", "true").schema(schema).json("d:/Nested_Json.json")
    l = []
    for i in Json_Df6.select("cars.*").schema.names:
        l.append(F.col(i))

    print(l)
    Json_Df6.createOrReplaceTempView("god")
    Spark.sql("select name, age, array(cars.*) as dd from god").filter(F.array_contains(F.col("dd"), "BMW")).show()
Example #15
 def animatedGifUrls(df):
     return df.where(
         col("extended_entities").isNotNull()
         & col("extended_entities.media").isNotNull() & array_contains(
             col("extended_entities.media.type"), "animated_gif")).select(
                 explode(
                     col("extended_entities.media.media_url_https")).alias(
                         "animated_gif_url"))
Example #16
def defineHeuristic4Miners(dfvj):
    p = list(pools)

    test = dfvj.where(size(col("vj_dest_address")) > 100).where(
        array_contains(col("vj_dest_address"), p[0]))
    for pool in p[1:]:
        tmp = dfvj.where(size(col("vj_dest_address")) > 100).where(
            array_contains(col("vj_dest_address"), pool))
        test = test.unionAll(tmp)

    tmp = test.selectExpr("vj_dest_address as a").collect()

    nonTrivialMiners = []
    for row in tmp:
        nonTrivialMiners += row.a
    nonTrivialMiners = set(nonTrivialMiners)

    return nonTrivialMiners
Example #17
 def videoUrls(df):
     return (df.where(
         col("extended_entities").isNotNull()
         & col("extended_entities.media").isNotNull()
         & col("extended_entities.media.video_info").isNotNull()
         & array_contains(col("extended_entities.media.type"), "video")
     ).select(
         explode(col("extended_entities.media.video_info.variants")).alias(
             "video_info")).filter("video_info is not NULL").select(
                 explode(col("video_info"))).withColumn(
                     "video_url", col("col.url")).drop(col("col")))
Example #18
 def group_tweets_from_hashtag_by_hour(self, hashtag):
     tweetsByHashtag = self.__tweets_df.where(
         array_contains(self.__tweets_df.hashtags, hashtag))
     result = tweetsByHashtag\
         .groupBy(\
             hour('created_at')\
                 .alias('created_at_hour')\
         )\
         .count()\
         .orderBy('created_at_hour')
     return parse_json_response(result.toJSON().collect())
Example #19
def explode_sampling(df):
    base_columns = df.columns
    columns = list(map(lambda c: split(col(c), args.split_char).alias(c) if c == args.class_col else col(c), base_columns))
    with_category_array = df.select(*columns)
    classes = list(map(lambda row: row[args.class_col], with_category_array.select(explode(args.class_col).alias(args.class_col)).distinct().collect()))

    if seed:
        sample = with_category_array.filter(array_contains(with_category_array[args.class_col], classes[0])).sample(fraction, seed)
    else:
        sample = with_category_array.filter(array_contains(with_category_array[args.class_col], classes[0])).sample(fraction)

    for clazz in classes[1:]:
        if seed :
            sample = sample.union(with_category_array.filter(array_contains(with_category_array[args.class_col], clazz)).sample(fraction, seed))
        else:
            sample = sample.union(with_category_array.filter(array_contains(with_category_array[args.class_col], clazz)).sample(fraction))

    select = list(map(lambda c: concat_ws(";", col(c)).alias(c) if c == args.class_col else col(c), base_columns))
    return sample.select(select), len(classes)
Example #20
def process_text(df: DataFrame) -> DataFrame:
    """Process features extracted from text fields"""
    df = df.withColumn(
        "flag_energy_title",
        sf.array_contains("title_text_features",
                          "energy").astype(IntegerType()))
    df = df.withColumn(
        "flag_energy_abstract",
        sf.array_contains("abstract_text_features",
                          "energy").astype(IntegerType()))
    df = df.withColumn(
        "flag_energy_claims",
        sf.array_contains("claims_text_features",
                          "energy").astype(IntegerType()))
    feature_cols = [
        "english_text_features", "flag_energy_title", "flag_energy_abstract",
        "flag_energy_claims"
    ]
    df = df.select("_file", *feature_cols)
    return df
Example #21
 def get_most_followed_users(self, hashtag, limit=5):
     tweetsByHashtag = self.__tweets_df.where(
         array_contains(self.__tweets_df.hashtags, hashtag)).alias('tweet')
     users = self.__users_df.alias('user')
     usersByHashtag = users.join(tweetsByHashtag,
                                 users.user_id == tweetsByHashtag.user_id)
     result = usersByHashtag\
         .select(users.user_id, users.screen_name, users.followers_count)\
         .orderBy(users.updated_at.desc())\
         .dropDuplicates(['user_id'])
     result = result.orderBy(result.followers_count.desc()).limit(limit)
     return parse_json_response(result.toJSON().collect())
Example #22
def createDict(df, all_plants):
    dict_list = []
    for state in states:
        plant_names = df.select(df.plant_name).where(
            array_contains(df.states,
                           state)).rdd.flatMap(lambda x: x).collect()
        dict1 = dict([(plant_name, 1) if plant_name in plant_names else
                      (plant_name, 0) for plant_name in all_plants])
        tuple_data = (state, dict1)
        dict_list.append(tuple_data)
    rdd = sc.parallelize(dict_list)
    return rdd
Example #23
def process_ipcr(df: DataFrame) -> DataFrame:
    """
    Generates a flag column for each section and combination of section/class of the patents indicating if the patent
    is part of this categorization
    """
    col = "bibliographic-data_classifications-ipcr_classification-ipcr"
    df = df.withColumn("ipcr_values", extract_ipcr(sf.col(col)))
    df = df.withColumn("ipcr_sections", sf.col("ipcr_values.sections"))
    df = df.withColumn("ipcr_sections_class",
                       sf.col("ipcr_values.sections_class"))
    for section in SECTIONS_IPCR:
        df = df.withColumn(
            f"section_{section}",
            sf.array_contains(sf.col("ipcr_sections"),
                              section).astype(IntegerType()))

    for section_class in SECTIONS_CLASS_IPCR:
        df = df.withColumn(
            f"section_class_{section_class}",
            sf.array_contains(sf.col("ipcr_sections_class"),
                              section_class).astype(IntegerType()))
    return df
Example #24
    def __init__(self):
        super(FeatureJsTotal, self).__init__()

        self.group_by_aggs = {
            'js_count':
            F.count(F.when(
                F.col('is_js') == True,  # noqa
                F.col('is_js')))
        }
        self.pre_group_by_calcs = {
            'is_js':
            F.array_contains(F.split(F.col('content_type'), '/'), 'javascript')
        }
Example #25
def customFunction(row):
    #person = rdd.map(lambda r: row(*r))
    #temp_df = sqlContext.createDataFrame(person)
    print "reaching\n", row['categories']
    t = str(row['categories'])
    #review_df.createTempView("rev")
    #temp_df = review_df.filter(review_df["business_id"]==row["business_id"])
    temp_df = bus_df.withColumn('cat_true', func.array_contains(bus_df['categories'], t))
    #temp_df = bus_df.where((t in bus_df["categories"]))
    #temp_df = sqlContext.sql("SELECT * FROM rev where rev['business_id']=t")
    #print (temp_df)
    df_x = temp_df.filter(temp_df.cat_true == True).drop('cat_true')
    df_x.write.json("/Users/apple/Desktop/dataset/businesses/"+str(t)+".json")
Example #26
        def one_hot_encode_top_n_tags(dataframe, n):
            """Produces a PySpark dataframe containing columns indicating whether each of the top n tags are present.

            :param dataframe: the PySpark dataframe 
            :param n: the number of the top ranked tags to return as tag fields
            :returns: the PySpark dataframe containing the top n tag fields and all fields in the supplied dataframe
            """
            top_n = [t.tag for t in df_tag_freq.orderBy(desc("frequency")).select("tag").limit(n).collect()]
            for tag in top_n:
                # replace tag name ".net" with "dotnet", for example, to avoid problems with periods in tag names
                tag_column_name = ("tag_"+tag).replace(".","dot")
                dataframe = dataframe.withColumn(tag_column_name, array_contains(dataframe.tags_split, tag).cast("int"))
            return dataframe
Example #27
    def __init__(self):
        super(FeatureImageTotal, self).__init__()

        self.group_by_aggs = {
            'image_count':
            F.count(
                F.when(
                    F.col('is_image') == True,  # noqa
                    F.col('is_image')))
        }
        self.pre_group_by_calcs = {
            'is_image':
            F.array_contains(F.split(F.col('content_type'), '/'), 'image')
        }
Example #28
    def transform(self, df):

        embeddings = df \
            .select('embedding_document') \
            .filter(~array_contains('embedding_document', np.nan)) \
            .toPandas()['embedding_document'] \
            .to_list()

        embeddings = np.array(embeddings, dtype=np.float32)

        model = umap.UMAP(n_neighbors=15, n_components=5, metric='cosine')

        model.fit(embeddings)

        return model
Example #29
    def transform(self, data):
        df_article, df_clustering, df_embeddings = data

        df_article_topic = df_article \
            .join(df_clustering, on='url_id') \
            .join(df_embeddings, on='url_id') \
            .select('url_id', 'time', 'header', 'tags', 'topic_id', 'embedding_document') \
            .filter(col('topic_id') != -1) \
            .filter(~array_contains('embedding_document', np.nan)) \
            .orderBy('time') \
            .toPandas()

        n_articles = 50000
        df_article_topic = df_article_topic.iloc[-n_articles:]

        return df_article_topic
Example #30
        def one_hot_encode_top_n_tags(dataframe, n):
            """Produces a PySpark dataframe containing columns indicating whether each of the top n tags are present.

            :param dataframe: the PySpark dataframe 
            :param n: the number of the top ranked tags to return as tag fields
            :returns: the PySpark dataframe containing the top n tag fields and all fields in the supplied dataframe
            """
            top_n = [
                t.tag for t in df_tag_freq.orderBy(desc("frequency")).select(
                    "tag").limit(n).collect()
            ]
            for tag in top_n:
                # replace tag name ".net" with "dotnet", for example, to avoid problems with periods in tag names
                tag_column_name = ("tag_" + tag).replace(".", "dot")
                dataframe = dataframe.withColumn(
                    tag_column_name,
                    array_contains(dataframe.tags_split, tag).cast("int"))
            return dataframe
Example #31
    def __init__(self):
        super(FeatureImageToHtmlRatio, self).__init__()

        self.group_by_aggs = {
            'html_count':
            F.count(
                F.when(F.col('is_html') == True, F.col('is_html'))  # noqa
            ),
            'image_count':
            F.count(
                F.when(F.col('is_image') == True, F.col('is_image'))  # noqa
            )
        }
        self.pre_group_by_calcs = {
            'is_html':
            F.col('content_type') == 'text/html',
            'is_image':
            F.array_contains(F.split(F.col('content_type'), '/'), 'image')
        }
Example #32
    def test_array_contains_function(self):
        from pyspark.sql.functions import array_contains

        df = self.spark.createDataFrame([(["1", "2", "3"],), ([],)], ['data'])
        actual = df.select(array_contains(df.data, "1").alias('b')).collect()
        self.assertEqual([Row(b=True), Row(b=False)], actual)
Example #33
# COMMAND ----------

df.select(split(col("Description"), " ").alias("array_col"))\
  .selectExpr("array_col[0]").show(2)


# COMMAND ----------

from pyspark.sql.functions import size
df.select(size(split(col("Description"), " "))).show(2) # shows 5 and 3


# COMMAND ----------

from pyspark.sql.functions import array_contains
df.select(array_contains(split(col("Description"), " "), "WHITE")).show(2)


# COMMAND ----------

from pyspark.sql.functions import split, explode

df.withColumn("splitted", split(col("Description"), " "))\
  .withColumn("exploded", explode(col("splitted")))\
  .select("Description", "InvoiceNo", "exploded").show(2)


# COMMAND ----------

from pyspark.sql.functions import create_map
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))
Example #34
    def augment(df):
        if 'addons' in df.columns:
            df = df.select(['*'] + [create_get_addon_name_udf(addon)(df['addons']).alias(addon.replace('.', '__DOT__')) for addon in all_addons] + [create_get_addon_version_udf(addon)(df['addons']).alias(addon.replace('.', '__DOT__') + '-version') for addon in all_addons])

        if 'json_dump' in df.columns:
            df = df.select(['*'] + [functions.array_contains(df['json_dump']['modules']['filename'], module_name).alias(module_id) for module_id, module_name in module_ids.items()])

        if 'plugin_version' in df.columns:
            df = df.withColumn('plugin', df['plugin_version'].isNotNull())

        if 'app_notes' in df.columns:
            df = df.select(['*'] + [(functions.instr(df['app_notes'], app_note.replace('__DOT__', '.')) != 0).alias(app_note) for app_note in all_app_notes] + [(functions.instr(df['app_notes'], 'Has dual GPUs') != 0).alias('has dual GPUs')])

        if 'graphics_critical_error' in df.columns:
            df = df.select(['*'] + [(functions.instr(df['graphics_critical_error'], error.replace('__DOT__', '.')) != 0).alias(error) for error in all_gfx_critical_errors])

        if 'total_virtual_memory' in df.columns and 'platform_version' in df.columns and 'platform' in df.columns:
            def get_arch(total_virtual_memory, platform, platform_version):
                if total_virtual_memory:
                    try:
                        if int(total_virtual_memory) < 2684354560:
                            return 'x86'
                        else:
                            return 'amd64'
                    except:
                        return 'unknown'
                elif platform == 'Mac OS X':
                    return 'amd64'
                else:
                    if 'i686' in platform_version:
                        return 'x86'
                    elif 'x86_64' in platform_version:
                        return 'amd64'

            get_arch_udf = functions.udf(get_arch, StringType())

            df = df.withColumn('os_arch', get_arch_udf(df['total_virtual_memory'], df['platform'], df['platform_version']))

        if 'adapter_driver_version' in df.columns:
            def get_driver_version(adapter_vendor_id, adapter_driver_version):
                # XXX: Sometimes we have a driver which is not actually made by the vendor,
                #      in those cases these rules are not valid (e.g. 6.1.7600.16385).
                if adapter_driver_version:
                    if adapter_vendor_id == '0x8086' or adapter_vendor_id == '8086':
                        return adapter_driver_version[adapter_driver_version.rfind('.') + 1:]
                    elif adapter_vendor_id == '0x10de' or adapter_vendor_id == '10de':
                        return adapter_driver_version[-6:-5] + adapter_driver_version[-4:-2] + '.' + adapter_driver_version[-2:]
                    # TODO: AMD?

                return adapter_driver_version

            get_driver_version_udf = functions.udf(get_driver_version, StringType())

            df = df.withColumn('adapter_driver_version_clean', get_driver_version_udf(df['adapter_vendor_id'], df['adapter_driver_version']))

        if 'cpu_info' in df.columns:
            df = df.withColumn('CPU Info', functions.substring_index(df['cpu_info'], ' | ', 1))
            df = df.withColumn('Is Multicore', functions.substring_index(df['cpu_info'], ' | ', -1) != '1')

        if 'dom_ipc_enabled' in df.columns:
            df = df.withColumnRenamed('dom_ipc_enabled', 'e10s_enabled')

        if 'memory_ghost_windows' in df.columns:
            df = df.withColumn('ghost_windows > 0', df['memory_ghost_windows'] > 0)

        if 'memory_top_none_detached' in df.columns:
            df = df.withColumn('top(none)/detached > 0', df['memory_top_none_detached'] > 0)

        return df