Example #1
    def registerFunction(self, name, f, returnType=StringType()):
        """Registers a python function (including lambda function) as a UDF
        so it can be used in SQL statements.

        In addition to a name and the function itself, the return type can be optionally specified.
        When the return type is not given, it defaults to a string and conversion will automatically
        be done.  For any other return type, the produced object must match the specified type.

        :param name: name of the UDF
        :param f: python function
        :param returnType: a :class:`pyspark.sql.types.DataType` object
        :return: a wrapped :class:`UserDefinedFunction`

        >>> strlen = spark.catalog.registerFunction("stringLengthString", len)
        >>> spark.sql("SELECT stringLengthString('test')").collect()
        [Row(stringLengthString(test)=u'4')]

        >>> spark.sql("SELECT 'foo' AS text").select(strlen("text")).collect()
        [Row(stringLengthString(text)=u'3')]

        >>> from pyspark.sql.types import IntegerType
        >>> _ = spark.catalog.registerFunction("stringLengthInt", len, IntegerType())
        >>> spark.sql("SELECT stringLengthInt('test')").collect()
        [Row(stringLengthInt(test)=4)]

        >>> from pyspark.sql.types import IntegerType
        >>> _ = spark.udf.register("stringLengthInt", len, IntegerType())
        >>> spark.sql("SELECT stringLengthInt('test')").collect()
        [Row(stringLengthInt(test)=4)]
        """
        udf = UserDefinedFunction(f, returnType, name)
        self._jsparkSession.udf().registerPython(name, udf._judf)
        return udf._wrapped()
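A minimal usage sketch, assuming an active Spark 2.x SparkSession named spark; the registered name "squared" is hypothetical, and the exact result column name may vary by Spark version:

from pyspark.sql.types import IntegerType

squared = spark.catalog.registerFunction(
    "squared", lambda x: x * x, IntegerType())

# The registered name is usable from SQL...
spark.sql("SELECT squared(4)").collect()  # [Row(squared(4)=16)]

# ...and the returned wrapper is usable from the DataFrame API.
df = spark.range(3)
df.select(squared(df["id"]).alias("sq")).show()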
Example #2
def get_final_uber_df(uber_df):
    """Converts json to Dataframe and puts the data in the most accessabe format"""

    # integer division keeps the result an int, matching IntegerType
    roundit = UserDefinedFunction(lambda x: int(x) // 2, IntegerType())
    joined = uber_df.select(roundit(uber_df["galvanize.time"]).alias("main_time"))
    for start, end in rides.items():
        df = uber_df.select(roundit(uber_df["{}.time".format(start)]).alias("time"),
                            uber_df["{}.{}.surge_multiplier".format(start, end)][2].alias("uber_{}".format(start)))
        joined = joined.join(df, joined.main_time == df.time).drop("time")
    return joined.sort("main_time")
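This function leans on module-level names it does not define. A sketch of the assumed context; the values are hypothetical, only the shape is implied by the code above:

from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import IntegerType

# Hypothetical: maps each ride's start key to its end key; the columns
# "<start>.time" and "<start>.<end>.surge_multiplier" are built from it.
rides = {"galvanize": "downtown", "airport": "galvanize"}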
Example #3
    def test_udf_init_shouldnt_initialize_context(self):
        UserDefinedFunction(lambda x: x, StringType())

        self.assertIsNone(
            SparkContext._active_spark_context,
            "SparkContext shouldn't be initialized when UserDefinedFunction is created."
        )
        self.assertIsNone(
            SparkSession._instantiatedSession,
            "SparkSession shouldn't be initialized when UserDefinedFunction is created."
        )
Example #4
    def test_udf_with_udt(self):
        row = Row(label=1.0, point=ExamplePoint(1.0, 2.0))
        df = self.spark.createDataFrame([row])
        self.assertEqual(1.0, df.rdd.map(lambda r: r.point.x).first())
        udf = UserDefinedFunction(lambda p: p.y, DoubleType())
        self.assertEqual(2.0, df.select(udf(df.point)).first()[0])
        udf2 = UserDefinedFunction(lambda p: ExamplePoint(p.x + 1, p.y + 1),
                                   ExamplePointUDT())
        self.assertEqual(ExamplePoint(2.0, 3.0),
                         df.select(udf2(df.point)).first()[0])

        row = Row(label=1.0, point=PythonOnlyPoint(1.0, 2.0))
        df = self.spark.createDataFrame([row])
        self.assertEqual(1.0, df.rdd.map(lambda r: r.point.x).first())
        udf = UserDefinedFunction(lambda p: p.y, DoubleType())
        self.assertEqual(2.0, df.select(udf(df.point)).first()[0])
        udf2 = UserDefinedFunction(lambda p: PythonOnlyPoint(p.x + 1, p.y + 1),
                                   PythonOnlyUDT())
        self.assertEqual(PythonOnlyPoint(2.0, 3.0),
                         df.select(udf2(df.point)).first()[0])
Example #5
    def _transform(self, dataset):
        SpacyAdvancedTokenize.setup(
            dataset._sc, dataset.sql_ctx, self.getLang())
        func = SpacyAdvancedTokenize.func(self.getLang(),
                                          self.getSpacyFields())
        retType = SpacyAdvancedTokenize.returnType(
            self.getLang(), self.getSpacyFields())
        udf = UserDefinedFunction(func, retType)
        return dataset.withColumn(
            self.getOutputCol(), udf(self.getInputCol())
        )
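A hedged usage sketch for the transformer above; the constructor keywords are inferred from the getters it calls (getLang, getSpacyFields, getInputCol, getOutputCol), not confirmed against sparklingml:

# Hypothetical usage; parameter names are assumptions.
tokenizer = SpacyAdvancedTokenize(lang="en_core_web_sm",
                                  spacyFields=["text", "lemma_"],
                                  inputCol="sentence",
                                  outputCol="tokens")
tokens_df = tokenizer.transform(sentences_df)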
Example #6
def benchmark(textInputPath, repeat, number):
    """
    Benchmark wordcount on a provided text input path. The timing is
    repeated `repeat` times, with `number` executions per repeat.

    :param textInputPath: The input path to perform wordcount on.
    :param repeat: Number of times to repeat the test
    :param number: Number of iterations to perform the wordcount per test
    """
    def benchmark_func(func):
        return timeit.repeat(func, repeat=repeat, number=number)

    print("Benchmarking wordcount:")
    tokenize = lambda x: x.split(" ")
    returnUDFType = ArrayType(StringType())
    # session.catalog.registerFunction does not return the UDF under Spark 2.0
    from pyspark.sql.functions import UserDefinedFunction
    tokenizeUDF = UserDefinedFunction(tokenize, returnUDFType, 'split')
    session.catalog.registerFunction("split", tokenize, returnUDFType)

    tokenizeJythonUDF = session.catalog.registerJythonFunction(
        "split", tokenize, returnUDFType)
    rdd = sc.textFile(textInputPath)
    rdd.cache()
    rdd.count()
    print("RDD:")
    print(
        timeit.repeat(lambda: legacy_word_count(rdd),
                      repeat=repeat,
                      number=number))
    rdd.unpersist()
    df = session.read.text(textInputPath)
    df.cache()
    df.count()
    print("DataFrame Python UDF:")
    python_udf_test_lambda = lambda: dataframe_udf_word_count(tokenizeUDF, df)
    python_udf_times = timeit.repeat(python_udf_test_lambda,
                                     repeat=repeat,
                                     number=number)
    print(python_udf_times)
    print("DataFrame Jython UDF:")
    jython_udf_test_lambda = lambda: dataframe_udf_word_count(
        tokenizeJythonUDF, df)
    jython_udf_times = timeit.repeat(jython_udf_test_lambda,
                                     repeat=repeat,
                                     number=number)
    print(jython_udf_times)
    print("DataFrame Scala UDF:")
    scala_udf_test_lambda = lambda: dataframe_scala_udf_word_count(df)
    scala_udf_times = timeit.repeat(scala_udf_test_lambda,
                                    repeat=repeat,
                                    number=number)
    print(scala_udf_times)
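The benchmark calls helpers that are not shown. A minimal sketch of what dataframe_udf_word_count might look like, assuming the text DataFrame keeps the default "value" column:

from pyspark.sql.functions import explode, col

def dataframe_udf_word_count(tokenizeUDF, df):
    # Tokenize each line with the UDF, explode to one row per word,
    # then count occurrences per word and materialize the result.
    words = df.select(explode(tokenizeUDF(col("value"))).alias("word"))
    return words.groupBy("word").count().collect()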
Example #7
    def task1b(self):
        """1) count of reposts from subscribed and not-subscribed groups"""

        userWallPosts = self.read_parquet_file("userWallPosts.parquet")
        userGroupsSubs = self.read_parquet_file("userGroupsSubs.parquet")

        reposts_t = userWallPosts \
            .filter(userWallPosts.is_reposted) \
            .select('owner_id', 'repost_info.orig_owner_id')\
            .withColumnRenamed("owner_id", "user")

        reposts = reposts_t.filter(reposts_t["orig_owner_id"] < 0)

        user_to_group_sub = userGroupsSubs\
            .select("user", "group")\
            .groupBy("user")\
            .agg(collect_set("group"))\
            .withColumnRenamed("collect_set(group)", "groups")

        def contains(id, groups):
            if not groups:
                return False
            return str(id) in groups

        contains_udf = UserDefinedFunction(contains)

        temp = reposts.join(user_to_group_sub, "user", how="left_outer")

        reposts_from = temp\
            .withColumn("from_subscribed", contains_udf(temp.orig_owner_id, temp.groups))

        reposts_from_subscribed = reposts_from\
            .filter(reposts_from.from_subscribed == 'true')\
            .select('user')\
            .groupBy('user')\
            .count()\
            .withColumnRenamed("count", "from_subscribed")

        reposts_not_from_subscribed = reposts_from \
            .filter(reposts_from['from_subscribed'] == 'false') \
            .select('user')\
            .groupBy("user")\
            .count()\
            .withColumnRenamed("count", "not_from_subscribed")

        result_table = reposts_from_subscribed\
            .join(reposts_not_from_subscribed, 'user', how="full_outer")\
            .fillna(0)

        return result_table
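Because contains is registered without a return type, it defaults to StringType, which is why the filters above compare against the strings 'true' and 'false'. A sketch of the same step with an explicit BooleanType, assuming the same frames as above:

from pyspark.sql.functions import col
from pyspark.sql.types import BooleanType

contains_udf = UserDefinedFunction(contains, BooleanType())
reposts_from = temp.withColumn(
    "from_subscribed", contains_udf(temp.orig_owner_id, temp.groups))
# With a boolean column, filter directly instead of comparing to 'true'.
reposts_from_subscribed = reposts_from.filter(col("from_subscribed"))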
Example #8
def filter_df(df_news):
    role_codes_of_interest = ["COP", "GOV", "JUD", "BUS", "CRM", "DEV", "EDU", "ENV",
                              "HLH", "LEG", "MED", "MNC"]

    # Filter data records that are of interest
    df_news = df_news.filter(df_news.ActionGeo_CountryCode == 'US')
    df_news = df_news.filter(df_news.Actor1Code != 'null')
    df_news = df_news.filter(df_news.Actor1Type1Code.isin(role_codes_of_interest))

    df_news.show()

    #df_news = df_news.filter(df_news.GoldsteinScale != 'null')
    #df_news = df_news.filter(df_news.GoldsteinScale != '')

    # There isn't a specific column specifying the state that the event happens in
    #   The state is tied with the country code
    #   So, parse out the state and make it a new column
    name = 'ActionGeo_ADM1Code'
    udf = UserDefinedFunction(lambda x: x[:2]+'-'+x[2:], StringType())
    df_news = df_news.select(*[udf(column).alias(name) if column == name else \
                                column for column in df_news.columns])
    split_col = F.split(df_news['ActionGeo_ADM1Code'],'-')
    df_news = df_news.withColumn('action_state', split_col.getItem(1))

    # The GoldsteinScale's range is between -10 and 10, but it's mostly sparse
    #   It's hard for a user to make use of it
    #   The scale is normalized and served as a scale between 0 to 100.
    #   Higher value denotes a more positive outcome
    name = 'GoldsteinScale'
    min_scale, max_scale = -10.0, 10.0
    norm = UserDefinedFunction(lambda x: (x - min_scale)/(max_scale - min_scale), DoubleType())
    df_news = df_news.select(*[norm(col).cast(DoubleType()) \
                .alias('normg_scale') if col == name else col for col in \
                    df_news.columns])

    df_news = df_news.filter(df_news.action_state != '')

    df_news = aggregate_job(df_news)

    return df_news
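The select comprehension above rewrites one column while passing the rest through. withColumn expresses the same transformation more directly; a sketch that should behave identically:

udf = UserDefinedFunction(lambda x: x[:2] + '-' + x[2:], StringType())
df_news = df_news.withColumn('ActionGeo_ADM1Code',
                             udf(df_news['ActionGeo_ADM1Code']))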
Example #9
    def test_udf_with_partial_function(self):
        d = [Row(number=i, squared=i**2) for i in range(10)]
        rdd = self.sc.parallelize(d)
        data = self.sqlCtx.createDataFrame(rdd)

        def some_func(col, param):
            if col is not None:
                return col + param

        pfunc = functools.partial(some_func, param=4)
        pudf = UserDefinedFunction(pfunc, LongType())
        res = data.select(pudf(data['number']).alias('plus_four'))
        self.assertEqual(res.agg({'plus_four': 'sum'}).collect()[0][0], 85)
Example #10
    def add_state_column(self, df_news):
        # There isn't a specific column specifying the state that the event happens in
        #   The state is tied with the country code
        #   So, parse out the state and make it a new column
        name = 'ActionGeo_ADM1Code'
        udf = UserDefinedFunction(lambda x: x[:2] + '-' + x[2:], StringType())
        df_news = df_news.select(*[
            udf(column).alias(name) if column == name else column
            for column in df_news.columns
        ])
        split_col = F.split(df_news['ActionGeo_ADM1Code'], '-')
        df_news = df_news.withColumn('action_state', split_col.getItem(1))

        return df_news
Example #11
    def test_udf_with_callable(self):
        d = [Row(number=i, squared=i**2) for i in range(10)]
        rdd = self.sc.parallelize(d)
        data = self.sqlCtx.createDataFrame(rdd)

        class PlusFour:
            def __call__(self, col):
                if col is not None:
                    return col + 4

        call = PlusFour()
        pudf = UserDefinedFunction(call, LongType())
        res = data.select(pudf(data['number']).alias('plus_four'))
        self.assertEqual(res.agg({'plus_four': 'sum'}).collect()[0][0], 85)
Example #12
def user_similarities_for_collab(users):
    # vectors already normalized during preprocessing
    udf = UserDefinedFunction(lambda arr: float(arr[0].dot(arr[1])),
                              DoubleType())
    users2 = users.select(*(col(x).alias(x + '_2') for x in users.columns))
    users2.cache()
    similarities = users.join(users2, col('user_id') != col('user_id_2'))
    similarities = similarities.withColumn(
        "similarity_unnormed",
        udf(array(similarities.features, similarities.features_2)))
    mean_sim, stddev_sim = similarities.select(
        mean("similarity_unnormed"), stddev("similarity_unnormed")).first()
    similarities = similarities.withColumn(
        "similarity", (col("similarity_unnormed") - mean_sim) / stddev_sim)
    return similarities
Example #13
    def task4b_friends(self):
        """3) aggregate (e.g. count, max, mean) characteristics for comments and likes (separtely) made by (a) friends
            and (b) followers per user"""

        friends = self.read_parquet_file("friends.parquet")

        # userWallPosts = self.read_parquet_file("userWallPosts.parquet")
        userWallComments = self.read_parquet_file("userWallComments.parquet")
        userWallLikes = self.read_parquet_file("userWallLikes.parquet")

        user_friends = friends\
            .groupBy("profile")\
            .agg(collect_set("follower"))\
            .withColumnRenamed("collect_set(follower)", "friends")\
            .select("profile", "friends")

        comments = userWallComments.select("post_owner_id", "from_id",
                                           "post_id")

        def contains(id, groups):
            if not groups:
                return False
            return str(id) in groups

        contains_udf = UserDefinedFunction(contains)

        post_comment_to_relation = comments\
            .withColumnRenamed("post_owner_id", "profile")\
            .join(user_friends, "profile", how="left_outer")\
            .withColumn("is_from_friend", contains_udf(col("from_id"), col("friends")))\
            .select("profile", "is_from_friend", "post_id")\
            .filter(col("is_from_friend") == "true")\

        comments_from_friends_per_post = post_comment_to_relation\
            .groupBy("post_id")\
            .count()

        result_table = post_comment_to_relation\
            .select("profile", "post_id")\
            .join(comments_from_friends_per_post, "post_id")\
            .groupBy("profile")\
            .agg(max("count"), mean("count"), sum("count"))\
            .sort(desc("sum(count)"))

        result_table.show()
Example #14
    def normalize_goldstein(self, df_news):

        # The GoldsteinScale's range is between -10 and 10, but it's mostly sparse
        # It's hard for a user to make use of it
        # The scale is normalized and served as a scale between 0 to 100.
        # Higher value denotes a more positive outcome
        name = 'GoldsteinScale'
        min_scale, max_scale = -10.0, 10.0
        norm = UserDefinedFunction(
            lambda x: (x - min_scale) / (max_scale - min_scale), DoubleType())
        df_news = df_news.select(*[
            norm(col).cast(DoubleType()).alias('normg_scale') if col ==
            name else col for col in df_news.columns
        ])

        df_news = df_news.filter(df_news.action_state != '')

        return df_news
Example #15
    def test_udf_defers_judf_initialization(self):
        # This is separate from UDFInitializationTests
        # to avoid context initialization
        # when udf is called
        f = UserDefinedFunction(lambda x: x, StringType())

        self.assertIsNone(
            f._judf_placeholder,
            "judf should not be initialized before the first call.")

        self.assertIsInstance(f("foo"), Column,
                              "UDF call should return a Column.")

        self.assertIsNotNone(
            f._judf_placeholder,
            "judf should be initialized after UDF has been called.")
Example #16
def M3():
    # 3) aggregate (e.g. count, max, mean) characteristics for comments and
    # likes (separately) made by (a) friends and (b) followers per post
    def contains(id, groups):  # Filter
        if not groups:
            return False
        return str(id) in groups

    contains_udf = UserDefinedFunction(contains)

    user_friends = friends\
        .groupBy("profile")\
        .agg(collect_set("follower").alias("friends"))\
        .select("profile", "friends") #Get set of friends for an user

    comments = uWallC.select("post_owner_id", "from_id",
                             "post_id")  #Projection
    likes = uWallL.filter(uWallL.itemType=="post")\
                .selectExpr("ownerId as post_owner_id","likerId as from_id","itemId as post_id")

    #Join users with comments
    post_comment_to_relation = comments\
        .withColumnRenamed("post_owner_id", "profile")\
        .join(user_friends, "profile", "left_outer")\
        .withColumn("is_from_friend", contains_udf(col("from_id"), col("friends")))\
        .select("profile", "is_from_friend", "post_id")\
        .filter(col("is_from_friend") == "true")

    #Join users with likes
    post_like_to_relation = likes\
        .withColumnRenamed("post_owner_id", "profile")\
        .join(user_friends, "profile", "left_outer")\
        .withColumn("is_from_friend", contains_udf(col("from_id"), col("friends")))\
        .select("profile", "is_from_friend", "post_id")\
        .filter(col("is_from_friend") == "true")

    comments_from_friends_per_post = post_comment_to_relation.groupBy("post_id")\
                                        .count()\
                                        .groupBy("post_id")\
                                        .agg(max("count").alias("cMax"), mean("count").alias("cMean"), sum("count").alias("cSum"))

    likes_from_friends_per_post = post_like_to_relation.groupBy("post_id").count()\
                                        .groupBy("post_id")\
                                        .agg(max("count").alias("lMax"), mean("count").alias("lMean"), sum("count").alias("lSum"))

    return comments_from_friends_per_post.join(likes_from_friends_per_post,
                                               "post_id")
Example #17
def create_partition_columns(df):
    udf_getIntervalTime = UserDefinedFunction(getIntervalTime, StringType())

    df = df.withColumn(
        'ingestion_year',
        udf_getIntervalTime(df.creationdate, lit(IS_DATE), lit(YEAR_INDEX)))
    df = df.withColumn(
        'ingestion_month',
        udf_getIntervalTime(df.creationdate, lit(IS_DATE), lit(MONTH_INDEX)))
    df = df.withColumn(
        'ingestion_day',
        udf_getIntervalTime(df.creationdate, lit(IS_DATE), lit(DAY_INDEX)))
    df = df.withColumn(
        'ingestion_hour',
        udf_getIntervalTime(df.creationdate, lit(IS_TIME), lit(HOUR_INDEX)))

    return df
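getIntervalTime and the IS_DATE/IS_TIME/*_INDEX constants are not shown. A hypothetical sketch consistent with how they are called, assuming creationdate strings like '2020-01-02 03:04:05':

# Hypothetical constants and helper; the original module does not show them.
IS_DATE, IS_TIME = 0, 1
YEAR_INDEX, MONTH_INDEX, DAY_INDEX = 0, 1, 2
HOUR_INDEX = 0

def getIntervalTime(creationdate, part, index):
    # Split 'YYYY-MM-DD HH:MM:SS' into date and time components and
    # return the requested field as a string.
    date_part, time_part = creationdate.split(' ')
    fields = date_part.split('-') if part == IS_DATE else time_part.split(':')
    return fields[index]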
Example #18
def M1():  #1) count of reposts from subscribed and not-subscribed groups
    reposts_t = uWallP \
        .filter(uWallP.is_reposted) \
        .select('owner_id', 'repost_info.orig_owner_id')\
        .withColumnRenamed("owner_id", "user")

    reposts = reposts_t.filter(reposts_t["orig_owner_id"] < 0)

    user_to_group_sub = uGroupsS\
        .select("user", "group")\
        .groupBy("user")\
        .agg(collect_set("group"))\
        .withColumnRenamed("collect_set(group)", "groups")

    def contains(id, groups):
        if not groups:
            return False
        return str(id) in groups

    contains_udf = UserDefinedFunction(contains)

    temp = reposts.join(user_to_group_sub, "user", "left_outer")

    reposts_from = temp\
        .withColumn("from_subscribed", contains_udf(temp.orig_owner_id, temp.groups))

    reposts_from_subscribed = reposts_from\
        .filter(reposts_from.from_subscribed == 'true')\
        .select('user')\
        .groupBy('user')\
        .count()\
        .withColumnRenamed("count", "from_subscribed")

    reposts_not_from_subscribed = reposts_from \
        .filter(reposts_from['from_subscribed'] == 'false') \
        .select('user')\
        .groupBy("user")\
        .count()\
        .withColumnRenamed("count", "not_from_subscribed")

    reposts_count = reposts_from_subscribed\
        .join(reposts_not_from_subscribed, 'user', "full_outer")\
        .fillna(0)

    return reposts_count
Example #19
    def _kFold(self, dataset):
        nFolds = self.getOrDefault(self.numFolds)
        foldCol = self.getOrDefault(self.foldCol)

        datasets = []
        if not foldCol:
            # Do random k-fold split.
            seed = self.getOrDefault(self.seed)
            h = 1.0 / nFolds
            randCol = self.uid + "_rand"
            df = dataset.select("*", rand(seed).alias(randCol))
            for i in range(nFolds):
                validateLB = i * h
                validateUB = (i + 1) * h
                condition = (df[randCol] >= validateLB) & (df[randCol] <
                                                           validateUB)
                validation = df.filter(condition)
                train = df.filter(~condition)
                datasets.append((train, validation))
        else:
            # Use user-specified fold numbers.
            def checker(foldNum):
                if foldNum < 0 or foldNum >= nFolds:
                    raise ValueError(
                        "Fold number must be in range [0, %s), but got %s." %
                        (nFolds, foldNum))
                return True

            checker_udf = UserDefinedFunction(checker, BooleanType())
            for i in range(nFolds):
                training = dataset.filter(
                    checker_udf(dataset[foldCol]) & (col(foldCol) != lit(i)))
                validation = dataset.filter(
                    checker_udf(dataset[foldCol]) & (col(foldCol) == lit(i)))
                if training.rdd.getNumPartitions() == 0 or len(
                        training.take(1)) == 0:
                    raise ValueError("The training data at fold %s is empty." %
                                     i)
                if validation.rdd.getNumPartitions() == 0 or len(
                        validation.take(1)) == 0:
                    raise ValueError(
                        "The validation data at fold %s is empty." % i)
                datasets.append((training, validation))

        return datasets
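A usage sketch for the user-specified-folds branch: CrossValidator exposes it through the foldCol param (added in Spark 3.1); est, grid, and ev are assumed to be an estimator, a param grid, and an evaluator:

from pyspark.sql.functions import rand
from pyspark.ml.tuning import CrossValidator

# Assign each row to one of three folds up front, then let the
# validator split on that column instead of doing a random split.
df = dataset.withColumn("fold", (rand(seed=42) * 3).cast("int"))
cv = CrossValidator(estimator=est, estimatorParamMaps=grid,
                    evaluator=ev, numFolds=3, foldCol="fold")
model = cv.fit(df)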
Example #20
def remove_attachments(df, cleansed_df):
    ATTACHMENT = "attachment"

    def _remove_attachments_key(dic):
        dic_copy = dic.copy()
        for key in dic_copy:
            if (ATTACHMENT in key.lower()):
                del dic[key]
            elif (type(dic_copy[key]) == dict):
                _remove_attachments_key(dic[key])
            elif (type(dic_copy[key]) == list):
                for item in dic_copy[key]:
                    if (type(item) == dict):
                        _remove_attachments_key(item)
        return dic

    def _field_cleansing(row, field_name):
        row = row and json.loads(row)
        if row:
            field = row.get(field_name, {})
            if type(field) is dict:
                field = _remove_attachments_key(field)
            return field

    for field in df.schema.fields:
        field_type = field.dataType
        field_name = field.name
        if ATTACHMENT in field_name.lower():
            df = df.drop(field_name)
        elif field_type != StringType():
            print('Getting schema type for field {}'.format(field_name))
            structured_field_type = get_schema_type(field_name, cleansed_df)
            if structured_field_type:
                print('Everything was OK for {}'.format(field_name))
                udf_attachments_cleansing = UserDefinedFunction(
                    lambda row: _field_cleansing(row, field_name),
                    structured_field_type)
                df = df.withColumn(field_name,
                                   udf_attachments_cleansing("ToJSON"))

    return df
Example #21
def to_all_users_df(ugr_rdd, ugr_df):
    client = MongoClient()
    coll = client.bgg.game_comments
    username_dict = un_pickle_user_dict()
    count = 1

    schema = StructType([
        StructField('user_id', IntegerType()),
        StructField('game_id', IntegerType()),
        StructField('rating', DoubleType())
    ])

    all_users_recs_df = spark.createDataFrame(sc.emptyRDD(), schema)
    print(all_users_recs_df)
    for username, user_id in username_dict.items():
        if count > 10:
            break
        print("current iteration:", count)
        rated_game_ids_lst = [
            x["game_id"] for x in list(coll.find({"username": username}))
        ]
        # build an RDD of every game this user has not rated, keyed to their user_id
        user_unrated_games_rdd = ugr_rdd.filter(
            lambda x: x[1] not in rated_game_ids_lst).map(
                lambda x: (user_id, int(x[1]), x[2]))
        # create DataFrame
        user_unrated_df = spark.createDataFrame(user_unrated_games_rdd, schema)
        name = 'rating'
        # blank out the rating; None matches DoubleType and is filled with 0.0 below
        udf = UserDefinedFunction(lambda x: None, DoubleType())
        new_test_df = user_unrated_df.select(*[
            udf(column).alias(name) if column == name else column
            for column in user_unrated_df.columns
        ])
        new_test_df = new_test_df.na.fill(0.0)
        #drop the duplicates and at this point we have a unique df for the current user in iteration
        unique_games_df = new_test_df.dropDuplicates(['game_id'])
        # now iterate again and use .union on current data frame, repeat and return
        all_users_recs_df = all_users_recs_df.union(unique_games_df)
        print("rows in dataframe:", all_users_recs_df.count())
        count += 1
    return all_users_recs_df
Example #22
def predict_with_multiple_version(df, versions, model_date, spid):
    columns = df.columns
    for version_name in versions:
        version_infor = MODEL_VERSION_INFO[version_name]
        convmaps = get_convmap_dics(version_name, model_date)
        for k in convmaps[str(spid)].keys():
            df = df.withColumn(k + '_' + version_name, categorical_conv(convmaps[str(spid)][k])(col(k)))
        name_features = version_infor['func_feature_names'](df)
        name_features = convert_name_features(name_features, version_name, list(convmaps[str(spid)]))
        df = VectorAssembler(inputCols=name_features, outputCol='features_%s' % version_name).transform(df)
    print(df.columns)
    predicted_list = []
    for version_name in versions:
        model = get_model(version_name, spid, model_date)
        prob_col_name = 'prob_%s' % version_name
        df = df.withColumn('features', col('features_%s' % version_name))
        df = model.transform(df).withColumn(prob_col_name, UserDefinedFunction(lambda x: x.tolist()[1], DoubleType())(
            col('probability')))
        predicted_list.append(version_name)
        df = df.select(columns + ['prob_%s' % v for v in predicted_list] + ['features_%s' % v for v in versions])
    df = df.select(columns + ['prob_%s' % v for v in versions])
    return df
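categorical_conv is not shown; given how it is applied to a column, it is presumably a UDF factory over a conversion map. A hypothetical sketch (the DoubleType return type is an assumption):

from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import DoubleType

def categorical_conv(convmap):
    # Map a raw categorical value to its encoded value; unseen
    # values fall back to None (null in the resulting column).
    return UserDefinedFunction(lambda v: convmap.get(v), DoubleType())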
Example #23
def preprocess_user_data(df):
    drop_list = [
        'compliment_cool', 'compliment_cute', 'compliment_hot',
        'compliment_list', 'compliment_more', 'compliment_note',
        'compliment_photos', 'compliment_plain', 'compliment_profile',
        'compliment_writer', 'compliment_funny', 'friends', 'name'
    ]
    users = df.select([c for c in df.columns if c not in drop_list])

    # get all features to long/double
    indexer = StringIndexer(inputCol='user_id', outputCol='user_id_int')
    userIndexModel = indexer.fit(users)
    users = userIndexModel.transform(users)

    users = users.withColumn("yelping_since_date",
                             unix_timestamp(col("yelping_since")))
    users = users.drop('yelping_since')

    udf = UserDefinedFunction(lambda x: len(x.split(',')) if x != '' else 0,
                              IntegerType())
    users = users.withColumn('num_years_elite', udf(users.elite))
    users = users.drop('elite')

    # assemble vector + standardize
    feat_cols = [
        c for c in users.columns if c not in ['user_id', 'user_id_int']
    ]
    assembler = VectorAssembler(inputCols=feat_cols, outputCol="vector")
    scaler = StandardScaler(inputCol="vector",
                            outputCol="features_unnormed",
                            withStd=True,
                            withMean=True)
    users = assembler.transform(users)
    users = scaler.fit(users).transform(users)
    normalizer = Normalizer(inputCol="features_unnormed", outputCol="features")
    users = normalizer.transform(users)
    users = users.select('user_id', 'user_id_int', 'average_stars', 'features')
    return users
Example #24
def crear_features():

    import re
    from pyspark.sql import functions as f

    base = get_clean_data()
    udf = UserDefinedFunction(lambda x: re.sub('""', '0', str(x)),
                              StringType())
    base = base.select(*[udf(column).alias(column) for column in base.columns])

    base = base.withColumn(
        'findesemana',
        f.when(f.col('dayofweek') == 5,
               1).when(f.col('dayofweek') == 6,
                       1).when(f.col('dayofweek') == 7, 1).otherwise(0))
    base = base.withColumn(
        'quincena',
        f.when(f.col('dayofmonth') == 15,
               1).when(f.col('dayofmonth') == 14,
                       1).when(f.col('dayofmonth') == 16, 1).when(
                           f.col('dayofmonth') == 29,
                           1).when(f.col('dayofmonth') == 30, 1).when(
                               f.col('dayofmonth') == 31,
                               1).when(f.col('dayofmonth') == 1, 1).when(
                                   f.col('dayofmonth') == 2,
                                   1).when(f.col('dayofmonth') == 3,
                                           1).otherwise(0))
    #base = base.withColumn('dephour',f.when(f.length('crsdeptime')==3,f.col('crsdeptime').substr(0,1).cast("float")).otherwise(f.col('crsdeptime').substr(0,2).cast("float")) )
    base = base.withColumn('dephour',
                           f.when(f.col('dayofweek') == 5, 1).otherwise(0))
    base = base.withColumn(
        'seishoras',
        f.when(f.col('dephour') == 6,
               1).when(f.col('dephour') == 12,
                       1).when(f.col('dephour') == 18,
                               1).when(f.col('dephour') == 0, 1).otherwise(0))

    save_rds(base, "semantic.rita")
    return base
Example #25
    def task7a(self):
        """7) count of open / closed (e.g. private) groups a user participates in"""

        groupsProfiles = self.read_parquet_file("groupsProfiles.parquet")
        userGroupsSubs = self.read_parquet_file("userGroupsSubs.parquet")

        invert_id = UserDefinedFunction(lambda x: -int(x))
        user_to_group = userGroupsSubs \
            .select("user", invert_id("group")) \
            .withColumnRenamed("<lambda>(group)", "group")\
            .dropDuplicates()

        group_type = groupsProfiles\
            .select("id", "is_closed")\
            .withColumnRenamed("id", "group")\
            .dropDuplicates()

        user_to_group_type = user_to_group\
            .join(group_type, "group")

        opened_groups = user_to_group_type\
            .filter(user_to_group_type['is_closed'] == 0)\
            .groupBy("user")\
            .count()\
            .withColumnRenamed("count", "opened")

        closed_groups = user_to_group_type\
            .filter(user_to_group_type['is_closed'] > 0)\
            .groupBy("user")\
            .count()\
            .withColumnRenamed("count", "closed")

        result_table = opened_groups\
            .join(closed_groups, "user", how="full_outer")\
            .fillna(0)

        return result_table
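Renaming by the generated column name "<lambda>(group)" is fragile. Aliasing at call time avoids depending on that name; a sketch of the same projection:

invert_id = UserDefinedFunction(lambda x: -int(x))
user_to_group = userGroupsSubs \
    .select("user", invert_id("group").alias("group")) \
    .dropDuplicates()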
Example #26
def M4():
    # 4) aggregate (e.g. count, max, mean) characteristics for comments and
    # likes (separately) made by (a) friends and (b) followers per user
    def contains(id, groups):  # Filter
        if not groups:
            return False
        return str(id) in groups

    contains_udf = UserDefinedFunction(contains)

    user_friends = friends\
        .groupBy("profile")\
        .agg(collect_set("follower").alias("friends"))\
        .select("profile", "friends") #Get set of friends for an user

    comments = uWallC.select("post_owner_id", "from_id",
                             "post_id")  #Projection

    #Join comments with user-SetOfFriends
    post_comment_to_relation = comments\
        .withColumnRenamed("post_owner_id", "profile")\
        .join(user_friends, "profile", "left_outer")\
        .withColumn("is_from_friend", contains_udf(col("from_id"), col("friends")))\
        .select("profile", "is_from_friend", "post_id")\
        .filter(col("is_from_friend") == "true")

    comments_from_friends_per_post = post_comment_to_relation.groupBy(
        "post_id").count()

    aggregates_results = post_comment_to_relation\
        .select("profile", "post_id")\
        .join(comments_from_friends_per_post, "post_id")\
        .groupBy("profile")\
        .agg(max("count"), mean("count"), sum("count"))\
        .sort(desc("sum(count)"))

    return aggregates_results
Example #27
# mapping for qualification
qualificationMap = {"Graduate": 1.0, "Not Graduate": 0.0}

# mapping for property area
propertyAreaMap = {"Rural": 0.0, "Urban": 1.0}

sc = SparkContext()
sqlContext = SQLContext(sc)

data = sqlContext.read.load('processed_data.csv',
                            format='com.databricks.spark.csv',
                            header=True,
                            inferSchema='true')
data.cache()

modified_property_area = UserDefinedFunction(
    lambda m: 'Rural' if m == 'Semiurban' else m, StringType())
#print(modified_property_area)
data = data.withColumn('property_area_modified',
                       modified_property_area(data['property_area']))

marital_status_udf = UserDefinedFunction(lambda m: yes_no_map[m], DoubleType())
gender_udf = UserDefinedFunction(lambda g: genderMap.get(g, 0), DoubleType())
self_employed_udf = UserDefinedFunction(lambda se: yes_no_map.get(se, 0),
                                        DoubleType())
qualification_udf = UserDefinedFunction(lambda q: qualificationMap.get(q, 0),
                                        DoubleType())
self_employed_encode = UserDefinedFunction(
    lambda at: propertyAreaMap.get(at, 0), DoubleType())

data = data.withColumn('marital_status_encoded',marital_status_udf(data['marital_status']))\
     .withColumn('gender_encoded',gender_udf(data['gender'])) \
Example #28
    def test_udf_registration_return_type_not_none(self):
        with QuietTest(self.sc):
            with self.assertRaisesRegexp(TypeError, "Invalid returnType"):
                self.spark.catalog.registerFunction(
                    "f", UserDefinedFunction(lambda x, y: len(x) + y, StringType()), StringType())
Example #29
    def test_udf_registration_return_type_none(self):
        two_args = self.spark.catalog.registerFunction(
            "twoArgs", UserDefinedFunction(lambda x, y: len(x) + y, "integer"), None)
        self.assertEqual(two_args.deterministic, True)
        [row] = self.spark.sql("SELECT twoArgs('test', 1)").collect()
        self.assertEqual(row[0], 5)
Example #30
    def test_udf3(self):
        two_args = self.spark.catalog.registerFunction(
            "twoArgs", UserDefinedFunction(lambda x, y: len(x) + y))
        self.assertEqual(two_args.deterministic, True)
        [row] = self.spark.sql("SELECT twoArgs('test', 1)").collect()
        self.assertEqual(row[0], u'5')
Example #31
    .master("local")\
    .appName("Bank Marketing Prediction")\
    .getOrCreate()

dataset_all = spark.read.load("paper2/data/bank.csv",
                              format="com.databricks.spark.csv",
                              header="true",
                              inferSchema="true")

dataset_all.printSchema()
pd.DataFrame(dataset_all.take(5), columns=dataset_all.columns).transpose()
dataset_all.describe().toPandas().transpose()

# Start of data normalization
binary_yes_no_map = {'yes': 1.0, 'no': 0.0}
toNum = UserDefinedFunction(lambda k: binary_yes_no_map[k], DoubleType())

job_map = {'admin.':0.0, 'blue-collar':1.0, 'entrepreneur':2.0, 'housemaid':3.0, \
    'management':4.0, 'retired':5.0, 'self-employed':6.0, 'services':7.0, \
    'student':8.0, 'technician':9.0, 'unemployed':10.0, 'unknown':11.0}
jobToNum = UserDefinedFunction(lambda k: job_map[k], DoubleType())

marital_map = {'divorced': 0.0, 'married': 1.0, 'single': 2.0}
maritalToNum = UserDefinedFunction(lambda k: marital_map[k], DoubleType())

education_map = {
    'primary': 0.0,
    'secondary': 1.0,
    'tertiary': 2.0,
    'unknown': 3.0
}