def test_first_last_ignorenulls(self):
    from pyspark.sql import functions
    df = self.spark.range(0, 100)
    df2 = df.select(functions.when(df.id % 3 == 0, None).otherwise(df.id).alias("id"))
    df3 = df2.select(functions.first(df2.id, False).alias('a'),
                     functions.first(df2.id, True).alias('b'),
                     functions.last(df2.id, False).alias('c'),
                     functions.last(df2.id, True).alias('d'))
    self.assertEqual([Row(a=None, b=1, c=None, d=98)], df3.collect())
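# A minimal standalone sketch of the same first/last ignorenulls behaviour, assuming a
# local SparkSession (the names below are illustrative, not part of the test suite).
# Note that without an explicit ordering, first/last are non-deterministic in general;
# the test above relies on the row order produced by range().
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
demo = spark.createDataFrame([(None,), (1,), (2,), (None,)], "id int")
demo.select(
    F.first("id", ignorenulls=False).alias("with_nulls"),   # may return NULL
    F.first("id", ignorenulls=True).alias("skip_nulls"),    # first non-null value
    F.last("id", ignorenulls=True).alias("last_non_null"),  # last non-null value
).show()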
def reduce_to_ohlc(time, rdd):
    row_rdd = rdd.map(lambda row: row.split(',')) \
                 .filter(lambda row: len(row) == 3) \
                 .map(lambda row: Row(
                     symbol=row[0],
                     tx_time=datetime.strptime(row[2], '%Y-%m-%d %H:%M:%S.%f'),
                     price=float(row[1])
                 ))
    sql_context = get_sql_context_instance(rdd.context)
    data = sql_context.createDataFrame(row_rdd)
    data.cache()
    data.write.format('org.apache.spark.sql.cassandra') \
        .options(table='transactions2', keyspace='stock', cluster='Test Cluster') \
        .mode('append') \
        .save()

    ohlc = data.select('symbol', truncate_min(data.tx_time).alias('batch_time'), 'price', 'tx_time') \
        .orderBy('tx_time') \
        .groupBy('symbol', 'batch_time') \
        .agg(
            F.first(data.price).alias('open'),
            F.max(data.price).alias('high'),
            F.min(data.price).alias('low'),
            F.last(data.price).alias('close'),
            F.first(data.tx_time).alias('open_time'),
            F.last(data.tx_time).alias('close_time')
        )

    existing_ohlc = sql_context.read.format('org.apache.spark.sql.cassandra') \
        .options(table='ohlc_1_min2', keyspace='stock', cluster='Test Cluster') \
        .load() \
        .select('symbol', 'batch_time', 'open', 'open_time', 'high', 'low', 'close', 'close_time')

    merged_ohlc = ohlc.join(
        existing_ohlc,
        (ohlc.symbol == existing_ohlc.symbol) &
        (ohlc.batch_time == existing_ohlc.batch_time),
        'left'
    )

    merged_ohlc = merged_ohlc.select(
        ohlc.symbol.alias('symbol'),
        ohlc.batch_time.alias('batch_time'),
        F.when(existing_ohlc.open_time < ohlc.open_time, existing_ohlc.open).otherwise(ohlc.open).alias('open'),
        F.when(existing_ohlc.open_time < ohlc.open_time, existing_ohlc.open_time).otherwise(ohlc.open_time).alias('open_time'),
        F.when(existing_ohlc.close_time > ohlc.close_time, existing_ohlc.close).otherwise(ohlc.close).alias('close'),
        F.when(existing_ohlc.close_time > ohlc.close_time, existing_ohlc.close_time).otherwise(ohlc.close_time).alias('close_time'),
        F.when(existing_ohlc.low < ohlc.low, existing_ohlc.low).otherwise(ohlc.low).alias('low'),
        F.when(existing_ohlc.high > ohlc.high, existing_ohlc.high).otherwise(ohlc.high).alias('high')
    )

    merged_ohlc.write.format('org.apache.spark.sql.cassandra') \
        .options(table='ohlc_1_min2', keyspace='stock', cluster='Test Cluster') \
        .mode('append') \
        .save()
def test_aggregator(self):
    df = self.df
    g = df.groupBy()
    self.assertEqual([99, 100], sorted(g.agg({'key': 'max', 'value': 'count'}).collect()[0]))
    self.assertEqual([Row(**{"AVG(key#0)": 49.5})], g.mean().collect())

    from pyspark.sql import functions
    self.assertEqual((0, u'99'),
                     tuple(g.agg(functions.first(df.key), functions.last(df.value)).first()))
    self.assertTrue(95 < g.agg(functions.approxCountDistinct(df.key)).first()[0])
    self.assertEqual(100, g.agg(functions.countDistinct(df.value)).first()[0])
def levenshtein_cluster(df, col_name):
    # Prepare a group so we don't need to apply the fingerprint to the whole data set
    df = df.select(col_name).groupby(col_name).agg(F.count(col_name).alias("count"))
    df = KeyCollision.fingerprint(df, col_name)

    df_t = df.groupby(col_name + "_FINGERPRINT").agg(
        F.collect_list(col_name).alias("cluster"),
        F.size(F.collect_list(col_name)).alias("cluster_size"),
        F.first(col_name).alias("recommended"),
        F.sum("count").alias("count"))

    # Filter min distance
    df_l = DistanceCluster.levenshtein_filter(df, col_name)

    # Cluster
    df_l = df_l.join(df_t, (df_l[col_name + "_FROM"] == df_t[col_name + "_FINGERPRINT"]), how="left") \
        .cols.drop(col_name + "_FINGERPRINT") \
        .cols.drop([col_name + "_FROM", col_name + "_TO", col_name + "_LEVENSHTEIN_DISTANCE"]).table()

    return df_l
def saveCompletedJobRunScheduleData(microBatchDF):
    scheduleExplodeDF = microBatchDF.select(microBatchDF.job_id,
                                            microBatchDF.run_id,
                                            explode(microBatchDF.schedule))
    scheduleDF = scheduleExplodeDF.groupBy("job_id", "run_id").pivot("key").agg(first("value"))
    if DeltaTable.isDeltaTable(spark, completed_job_run_schedule_path):
        # merge data
        deltaTable = DeltaTable.forPath(spark, completed_job_run_schedule_path)
        (deltaTable.alias("target").merge(
            scheduleDF.alias("source"),
            "source.job_id=target.job_id and source.run_id=target.run_id").
         whenMatchedUpdateAll().whenNotMatchedInsertAll().execute())
    else:
        (scheduleDF.write.format("delta").mode("overwrite").option(
            "mergeSchema", "true").save(completed_job_run_schedule_path))
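# A minimal sketch of the explode -> pivot -> first("value") pattern used above, assuming
# the schedule column is a MapType; the sample data and column names are illustrative.
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, first

spark = SparkSession.builder.getOrCreate()
sample = spark.createDataFrame(
    [(1, 10, {"cron": "0 * * * *", "timezone": "UTC"})],
    ["job_id", "run_id", "schedule"])
wide = (sample.select("job_id", "run_id", explode("schedule"))  # yields key/value columns
        .groupBy("job_id", "run_id")
        .pivot("key")
        .agg(first("value")))  # one column per map key
wide.show()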
def statistic_school_address(df):
    """
    Analyze the correspondence between school and work location
    :param df:
    :return:
    """
    df = df.filter(df.address.isNotNull())
    groups = ("school_name", "degree", "address")
    df = add_median_salary(df, groups)
    sda_df = df.groupby(*groups).agg(
        F.count("*").alias("person_num"),
        F.first("avg_salary").alias("avg_salary"))
    sda_df = sda_df.filter(sda_df.person_num > MIN_NUM)

    # Analysis without restricting degree
    sa_df = sda_df.groupby("school_name", "address").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"))
    sa_df = sa_df.withColumn("degree", F.lit(NA))
    sda_df = sda_df.unionByName(sa_df)
    return sda_df
def deduplication(logger, df_dict: Dict[str, DataFrame], rules: Dict[str, List[str]]):
    """
    Deduplicate rows on a subset of columns and merge the data from the duplicates

    Args:
        logger: Logger instance used to log events
        df_dict: Dictionary of the datasets with the structure {Name: Dataframe}
        rules: {Dataset Name: [column1, column2]}

    Returns:
        Dict updated in place
    """
    try:
        for df_name, columns in rules.items():
            df_dict[df_name] = df_dict.get(df_name).groupBy(*columns) \
                .agg(*[first(x, ignorenulls=True).alias(x)
                       for x in df_dict.get(df_name).columns if x not in columns])
        logger.info("Dataframes cleaning deduplication applied")
    except Exception as e:
        logger.error("Cleaning duplicate rows couldn't be performed: {}".format(e),
                     traceback.format_exc())
        raise e
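# Hypothetical usage of the helper above, assuming a SparkSession `spark` and a `logger`
# are in scope; the dataset and column names are made up for illustration.
df_dict = {"customers": spark.createDataFrame(
    [(1, "Ann", None), (1, None, "NY"), (2, "Bob", "LA")],
    ["id", "name", "city"])}
deduplication(logger, df_dict, {"customers": ["id"]})
# id=1 typically collapses to a single row with name="Ann" and city="NY", since
# first(..., ignorenulls=True) keeps the first non-null value seen per column.
df_dict["customers"].show()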
def main():
    spark = SparkSession.builder.master("local").appName("Word Count").config(
        "spark.some.config.option", "some-value").getOrCreate()
    # sc = SparkContext()
    l = [(None, 1), ('Aliceaa', 3), ('Alices', None), ('Alicesssss', 1), ('Alices', 3)]
    x = spark.createDataFrame(l, ['name', 'age'])

    def myFunc(data_list):
        for val in data_list:
            if val is not None and val != '':
                return val
        return None

    myUdf = udf(myFunc, StringType())

    x = x.groupBy('age') \
        .agg(first('name').alias('name'))

    # dropping duplicates from the dataframe
    x.dropDuplicates().show()
def group_batched_logs(df_logs):
    # group logs by did + interval_time + keyword.
    # group 1: group by did + interval_starting_time + keyword
    df = df_logs.groupBy('aid', 'interval_starting_time', 'keyword_index').agg(
        first('keyword').alias('keyword'),
        first('age').alias('age'),
        first('gender_index').alias('gender_index'),
        first('aid_bucket').alias('aid_bucket'),
        fn.sum(col('is_click')).alias('kw_clicks_count'),
        fn.sum(fn.when(col('is_click') == 0, 1).otherwise(0)).alias('kw_shows_count'),
    )
    # df = df.orderBy('keyword_index')
    df = df.withColumn('kwi_clicks_count', concat_ws(":", col('keyword_index'), col('kw_clicks_count')))
    df = df.withColumn('kwi_shows_count', concat_ws(":", col('keyword_index'), col('kw_shows_count')))
    df = df.withColumn('kw_clicks_count', concat_ws(":", col('keyword'), col('kw_clicks_count')))
    df = df.withColumn('kw_shows_count', concat_ws(":", col('keyword'), col('kw_shows_count')))

    # group 2: group by did + interval_starting_time
    df = df.groupBy('aid', 'interval_starting_time').agg(
        concat_ws(",", collect_list('keyword_index')).alias('kwi'),
        concat_ws(",", collect_list('kwi_clicks_count')).alias('kwi_click_counts'),
        concat_ws(",", collect_list('kwi_shows_count')).alias('kwi_show_counts'),
        concat_ws(",", collect_list('keyword')).alias('interval_keywords'),
        concat_ws(",", collect_list('kw_clicks_count')).alias('kw_click_counts'),
        concat_ws(",", collect_list('kw_shows_count')).alias('kw_show_counts'),
        first('age').alias('age'),
        first('gender_index').alias('gender_index'),
        first('aid_bucket').alias('aid_bucket'))
    return df
def search_clients_daily(main_summary):
    return agg_search_data(
        main_summary,
        [
            'client_id',
            'submission_date',
            'engine',
            'source',
        ],
        list(map(agg_first, [
            'country',
            'app_version',
            'distribution_id',
            'locale',
            'search_cohort',
            'addon_version',
            'os',
            'channel',
            'profile_creation_date',
            'default_search_engine',
            'default_search_engine_data_load_path',
            'default_search_engine_data_submission_url',
            'sample_id',
        ])) + [
            # Count of 'first' subsessions seen for this client_day
            (count(when(col('subsession_counter') == 1, 1))
             .alias('sessions_started_on_this_day')),
            first(
                datediff(
                    'subsession_start_date',
                    from_unixtime(col('profile_creation_date') * 24 * 60 * 60))
            ).alias('profile_age_in_days'),
            sum(col('subsession_length') / 3600.0).alias('subsession_hours_sum'),
            mean(size('active_addons')).alias('active_addons_count_mean'),
            (max('scalar_parent_browser_engagement_max_concurrent_tab_count')
             .alias('max_concurrent_tab_count_max')),
            (sum('scalar_parent_browser_engagement_tab_open_event_count')
             .alias('tab_open_event_count_sum')),
            (sum(col('active_ticks') * 5 / 3600.0).alias('active_hours_sum')),
        ])
def statistic_school_rank(df):
    """
    School ranking
    :param df:
    :return:
    """
    groups = ("school_name", "degree")
    df = add_median_salary(df, groups)
    sd_df = df.groupby(*groups).agg(
        F.count("*").alias("person_num"),
        F.first("avg_salary").alias("avg_salary"))
    sd_df = sd_df.filter(sd_df.person_num > MIN_NUM)

    # Analysis without restricting degree
    s_df = sd_df.groupby("school_name").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"))
    s_df = s_df.withColumn("degree", F.lit(NA))
    sd_df = sd_df.unionByName(s_df)
    sd_df = sd_df.filter(sd_df.person_num > MIN_NUM)
    sd_df = add_rank(sd_df, "degree")
    return sd_df
def _event_prop(event_type: str, expr: Column) -> Column:
    """Get a property from the event of a certain type within the ad session.

    Parameters
    ----------
    event_type
        Event type.
    expr
        Value expression.

    Returns
    -------
    Column
        Column expression that evaluates to the provided `expr` if the event
        type matches the specified one, or None otherwise.
    """
    return first(
        when(col('type') == event_type, expr),
        ignorenulls=True
    )
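# Hypothetical usage: inside a groupBy over ad sessions, pull one property per event type.
# The `events` dataframe and its column names are assumptions for illustration only.
from pyspark.sql.functions import col

per_session = events.groupBy('session_id').agg(
    _event_prop('click', col('timestamp')).alias('click_time'),
    _event_prop('impression', col('price')).alias('impression_price'))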
def pose():
    spark = SparkSession.builder.appName('csql_demo1').master('local[*]').getOrCreate()
    SparkSession_2 = spark.newSession()

    # query = "(SELECT * FROM attribute_kv) as r"
    query = "(SELECT * FROM attribute_kv WHERE entity_type = 'DEVICE') as r"
    get_data = spark.read.format('jdbc').option(
        'driver', 'org.postgresql.Driver').option(
        'url', 'jdbc:postgresql://192.168.1.36:5432/thingsboard').option(
        'user', 'postgres').option('password', 'postgres').option('dbtable', query).load()

    dx = get_data.withColumn(
        'value',
        concat_ws('', get_data.bool_v, get_data.long_v, get_data.dbl_v,
                  get_data.json_v, get_data.str_v))
    dx = dx.filter(dx.attribute_type == 'SERVER_SCOPE')
    nl = dx.groupBy('entity_id', 'attribute_type').pivot('attribute_key').agg(first('value'))
    ld = nl.withColumnRenamed('entity_id', 'device_id')

    query = "(SELECT name, type, id FROM device) as r"
    sk = spark.read.format('jdbc').option(
        'driver', 'org.postgresql.Driver').option(
        'url', 'jdbc:postgresql://192.168.1.36:5432/thingsboard').option(
        'user', 'postgres').option('password', 'postgres').option('dbtable', query).load()

    joined_data = ld.join(sk, ld.device_id == sk.id)
    req_det = joined_data.rdd.map(lambda x: [
        x.name, x.device_id, x.attribute_type, x.scNo, x.simNo, x.imeiNumber,
        x.boardNumber, x.zoneName, x.wardName, x.location, x.phase, x.ccmsType,
        x.kva, x.baseWatts, x.baseLine, x.connectedWatts, x.roadType,
        x.latitude, x.longitude
    ]).collect()
    return req_det
def filter_df_on_start_activities_nocc(df, nocc, sa_count0=None,
                                       timestamp_key=DEFAULT_TIMESTAMP_KEY,
                                       case_id_glue=CASE_CONCEPT_NAME,
                                       activity_key=DEFAULT_NAME_KEY,
                                       grouped_df=None):
    """Filters the Spark dataframe, keeping only cases whose start activity
    occurs at least `nocc` times"""
    if grouped_df is None:
        grouped_df = df.groupby(case_id_glue)
    if sa_count0 is None:
        parameters = {
            PARAMETER_CONSTANT_TIMESTAMP_KEY: timestamp_key,
            PARAMETER_CONSTANT_CASEID_KEY: case_id_glue,
            PARAMETER_CONSTANT_ACTIVITY_KEY: activity_key,
            GROUPED_DATAFRAME: grouped_df
        }
        sa_count0 = get_start_activities(df, parameters=parameters)
    sa_count = [k for k, v in sa_count0.items() if v >= nocc]
    if len(sa_count) < len(sa_count0):
        grouped_df = grouped_df.agg(F.first(activity_key).alias(activity_key + "_1"))
        df_start = grouped_df.filter(grouped_df[activity_key + "_1"].isin(sa_count))
        return df.join(F.broadcast(df_start), grouped_df.columns[0]).drop(activity_key + "_1")
    return df
def test_aggregator(self):
    df = self.df
    g = df.groupBy()
    self.assertEqual([99, 100], sorted(g.agg({"key": "max", "value": "count"}).collect()[0]))
    self.assertEqual([Row(**{"AVG(key#0)": 49.5})], g.mean().collect())

    from pyspark.sql import functions
    self.assertEqual((0, "99"),
                     tuple(g.agg(functions.first(df.key), functions.last(df.value)).first()))
    self.assertTrue(95 < g.agg(functions.approx_count_distinct(df.key)).first()[0])
    # test deprecated countDistinct
    self.assertEqual(100, g.agg(functions.countDistinct(df.value)).first()[0])
def saveCompletedJobRunTaskData(microBatchDF):
    taskExplode1 = microBatchDF.select(microBatchDF.job_id,
                                       microBatchDF.run_id,
                                       explode(microBatchDF.cluster_spec))
    taskExplode2 = taskExplode1.select(taskExplode1.job_id,
                                       taskExplode1.run_id,
                                       taskExplode1.key.alias("task_type"),
                                       explode(taskExplode1.value))
    taskDF = taskExplode2.groupBy("job_id", "run_id", "task_type").pivot("key").agg(first("value"))
    if DeltaTable.isDeltaTable(spark, completed_job_run_task_path):
        # merge data
        deltaTable = DeltaTable.forPath(spark, completed_job_run_task_path)
        (deltaTable.alias("target").merge(
            taskDF.alias("source"),
            "source.job_id=target.job_id and source.run_id=target.run_id").
         whenMatchedUpdateAll().whenNotMatchedInsertAll().execute())
    else:
        (taskDF.write.format("delta").mode("overwrite").option(
            "mergeSchema", "true").save(completed_job_run_task_path))
def levenshtein_cluster(df, input_col):
    """
    Return a dataframe with the cluster of strings related to each string
    :param df: Spark Dataframe
    :param input_col:
    :return:
    """
    # Prepare a group so we don't need to apply the fingerprint to the whole data set
    df = df.select(input_col).groupby(input_col).agg(F.count(input_col).alias("count"))
    df = keycollision.fingerprint(df, input_col)

    count_col = name_col(input_col, COUNT_COL)
    cluster_col = name_col(input_col, CLUSTER_COL)
    recommended_col = name_col(input_col, RECOMMENDED_COL)
    cluster_size_col = name_col(input_col, CLUSTER_SIZE_COL)
    fingerprint_col = name_col(input_col, FINGERPRINT_COL)

    df_t = df.groupby(fingerprint_col).agg(
        F.collect_list(input_col).alias(cluster_col),
        F.size(F.collect_list(input_col)).alias(cluster_size_col),
        F.first(input_col).alias(recommended_col),
        F.sum("count").alias(count_col)).repartition(1)

    # if Optimus.cache:
    #     df_t = df_t.cache()

    # Filter nearest string
    df_l = levenshtein_filter(df, input_col).repartition(1)
    if Optimus.cache:
        df_l = df_l.cache()

    # Create cluster
    df_l = df_l.join(df_t, (df_l[input_col + "_FROM"] == df_t[fingerprint_col]), how="left") \
        .cols.drop(fingerprint_col) \
        .cols.drop([input_col + "_FROM", input_col + "_TO", name_col(input_col, "LEVENSHTEIN_DISTANCE")])

    return df_l
def gen_freq_distr_user_data(userDF, attrs):
    userWt = weightCol
    # get weighted frequencies of each category of a user's categorical attributes
    print("[getCategoryFreqs] Grouping records by users")
    categoryFreqInfo = {}
    udf = userDF
    for attr in attrs:
        print("Processing attribute %s" % attr)
        # get wt for each individual user
        _tbl = udf.filter(udf[attr].isNotNull()) \
            .groupby(userIdCol, attr) \
            .agg(F.first(userWt).alias(userWt))
        # sum up the weight for each value of the attribute
        _tbl = _tbl.groupby(attr).agg(F.sum(userWt).alias('wt'))
        attrInfo = _tbl.collect()
        # build a dict of {attrValue: freq}
        vals = {x[attr]: x['wt'] for x in attrInfo}
        # sum of all occurrences of the attribute
        tot = sum(vals.values())
        # compute relative freq w.r.t. the total occurrences of the attribute
        info = {val: float(wt) / tot for val, wt in vals.items()}
        categoryFreqInfo[attr] = info
    return categoryFreqInfo
def statistic_major_position(df):
    """
    Positions corresponding to each major
    :param df:
    :return:
    """
    groups = ("major", "degree", "position_name")
    df = df.filter(df.position_name.isNotNull())

    # Position aliases
    df = df.withColumn("position_title", F.lower(F.trim(df.position_title)))
    pdf = df.groupby("position_name", "position_title").agg(F.count("*").alias("total"))
    pdf = pdf.groupby("position_name").apply(filter_position)
    pdf = pdf.groupby("position_name").agg(F.collect_set("position_title").alias("position_set"))
    pdf = pdf.withColumn("position_alias", F.udf(lambda x: "/".join(x))(pdf.position_set))
    pdf = pdf.select("position_name", "position_alias")

    # Industries corresponding to each position
    idf = df.groupby("position_name", "industry").agg(F.count("*").alias("total"))
    idf = idf.groupby("position_name").apply(filter_industry)
    idf = idf.groupby("position_name").agg(F.collect_set("industry").alias("industry_set"))
    idf = idf.withColumn("industry_alias", F.udf(lambda x: "/".join(x))(idf.industry_set))
    idf = idf.select("position_name", "industry_alias")

    # Analysis restricted by degree
    df = add_median_salary(df, groups)
    mdp_df = df.groupby(*groups).agg(
        F.count("*").alias("person_num"),
        F.first(df.avg_salary).alias("avg_salary"))
    mdp_df = mdp_df.filter(mdp_df.person_num > MIN_NUM)

    # Analysis without restricting degree
    mp_df = mdp_df.groupby("major", "position_name").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"))
    mp_df = mp_df.withColumn("degree", F.lit(NA))
    mdp_df = mdp_df.unionByName(mp_df)

    # Merge in the position aliases
    mdp_df = mdp_df.join(pdf, "position_name")
    # Merge in the industries corresponding to each position
    mdp_df = mdp_df.join(idf, "position_name")
    return mdp_df
def user_cluster_model(spark, ratings, movies, k, genres):
    """ Returns a clustering model for users' genre preferences """
    # Get all user ids
    all_user_ids = ratings.select("userId").distinct().rdd.flatMap(lambda x: x).collect()

    # Calculate scores for each user
    scores = user_genre_scores(spark, ratings, movies, all_user_ids) \
        .sort(col("userId"), col("genre"))

    # Convert genres in rows to columns
    scores = scores.groupBy("userId").pivot("genre").agg(first("score")).na.fill(0)

    # Ignore movies without genres
    if "(no genres listed)" in scores.columns:
        scores = scores.drop("(no genres listed)")
    scores.cache()

    # Find the genres used in the dataset
    genres_in_scores = scores.drop("userId").columns

    # Train a k-means model
    scores = VectorAssembler(inputCols=genres_in_scores, outputCol="features").transform(scores)
    kmeans_model = KMeans().setK(k).setSeed(5052).fit(scores)

    # Save the genres used to the model object
    kmeans_model.genres = genres_in_scores

    # Calculate the silhouette score & save it to the model
    train_predictions = kmeans_model.transform(scores)
    kmeans_model.sihlouette_score = ClusteringEvaluator().evaluate(train_predictions)
    return kmeans_model
def create_yearly_weather(spark) -> DataFrame:
    """
    Reads in 3 years of daily weather reports from throughout the world.
    After filtering to US stations only and keeping only the most prevalent
    key weather metrics, the dataframe is pivoted so it can be easily joined
    with the review and distances dataframes.
    """
    yearly_weather_path = f"s3://{s3_bucket}/ghcn/year_*"
    elements_to_keep = ['PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN']

    yearly_weather = (
        spark.read.csv(yearly_weather_path, header=False, schema=yearly_weather_schema)
        .filter(col('element').isin(elements_to_keep))
        .filter(col('station_id').startswith('US'))
        .withColumn('year', substring(col('date'), 1, 4))
        .withColumn('month', substring(col('date'), 5, 2))
        .withColumn('day', substring(col('date'), 7, 2))
        .withColumn('weather_date',
                    to_date(concat_ws('-', col('year'), col('month'), col('day'))))
        .select(col('station_id'), col('weather_date'), col('element'),
                col('value').cast(IntegerType()))
        .repartition(200, 'station_id', 'weather_date'))

    yearly_weather_pivot = (
        yearly_weather.groupby('station_id', 'weather_date')
        .pivot('element')
        .agg(first('value'))
        .dropna(subset=['PRCP', 'TMAX', 'TMIN'])
        .repartition(200, 'station_id', 'weather_date'))

    return yearly_weather_pivot
def assoc_fn(df: DataFrame, group_by_cols):
    gbc = [col(x) for x in group_by_cols]
    h_fn = partial(harmonic_fn,
                   partition_cols=group_by_cols,
                   over_col="evs_score",
                   output_col=harmonic_col)
    assoc_df = (
        df.withColumn("evs_score",
                      array_min(array(col("evidence_score") / 10.0, lit(1.0))))
        .transform(h_fn)
        .groupBy(*gbc)
        .agg(countDistinct(col("pmid")).alias("f"),
             mean(col("evidence_score")).alias("mean"),
             stddev(col("evidence_score")).alias("std"),
             max(col("evidence_score")).alias("max"),
             min(col("evidence_score")).alias("min"),
             expr("approx_percentile(evidence_score, array(0.25, 0.5, 0.75))").alias("q"),
             count(col("pmid")).alias("N"),
             first(col(harmonic_col)).alias(harmonic_col))
        .withColumn("median", element_at(col("q"), 2))
        .withColumn("q1", element_at(col("q"), 1))
        .withColumn("q3", element_at(col("q"), 3))
        .drop("q"))
    return assoc_df
def run_job(trades: DataFrame):
    """
    Generates daily summaries of the provided trade data grouped by security.

    Returns a DataFrame with the following columns:
    - Security
    - Date
    - TradedVolume
    - NumberOfTrades
    - StartPrice
    - EndPrice
    - HighPrice
    - LowPrice
    - Volatility

    The DataFrame is ordered alphabetically by security, then in reverse
    chronological order.

    :param trades: DataFrame
    :return: DataFrame
    """
    with_roc = trades.withColumn(
        "ROC", udf(calculate_roc, FloatType())(trades.StartPrice, trades.EndPrice))
    grouped = with_roc.groupBy('Mnemonic', 'Date') \
        .agg(
            sum('TradedVolume').alias('TradedVolume'),
            sum('NumberOfTrades').alias('NumberOfTrades'),
            first('StartPrice').alias('StartPrice'),
            last('EndPrice').alias('EndPrice'),
            max('MaxPrice').alias('HighPrice'),
            min('MinPrice').alias('LowPrice'),
            sum('ROC').alias('Volatility'),
        ) \
        .withColumnRenamed('Mnemonic', 'Security') \
        .orderBy(asc('Security'), desc('Date'))
    return grouped
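# Hypothetical invocation of run_job, assuming a SparkSession `spark` and the
# calculate_roc helper are in scope; the sample row and its values are illustrative.
trades = spark.createDataFrame(
    [("SAP1", "2018-03-26", 100, 5, 98.0, 99.5, 100.0, 97.5)],
    ["Mnemonic", "Date", "TradedVolume", "NumberOfTrades",
     "StartPrice", "EndPrice", "MaxPrice", "MinPrice"])
run_job(trades).show()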
def process_df(self, df):
    def detect_anomaly(ts):
        """
        Args:
            ts: list of downsampled values collected per id
        rtype: int
        """
        outliers_indices = seasonal_esd(ts, hybrid=True, max_anomalies=10)
        return len(outliers_indices)

    grouped_df = df.groupBy(["id"]).agg(
        F.collect_list("downsample_avg").alias("downsampled_ts"),
        first("start_ts").alias("start_ts"),
        last("end_ts").alias("end_ts"))

    anomaly_udf = udf(detect_anomaly, IntegerType())
    # apply the anomaly detector to the aggregated column, "downsampled_ts"
    processed_df = grouped_df.withColumn(
        "num_anomaly", anomaly_udf("downsampled_ts")).sort(desc("num_anomaly"))
    final_df = processed_df.select("id", "start_ts", "end_ts", "num_anomaly")

    try:
        connector = pgConnector.PostgresConnector(
            "ec2-3-94-71-208.compute-1.amazonaws.com", "datanodedb", "datanode", "password")
        connector.write(final_df, "global_anomalies_table", "append")
    except Exception as e:
        print(e)
        pass
def group_batched_logs(df_logs_batched):
    # group the logs to generate the train-ready data from the basic unit of
    # uckey + interval_time + keyword.
    # group 1: group by uckey + interval_starting_time + keyword
    df = df_logs_batched.groupBy('uckey', 'interval_starting_time', 'keyword_index').agg(
        first('keyword').alias('keyword'),
        fn.sum(col('is_click')).alias('keyword_click_count'),
        fn.count(fn.when(col('is_click') == 0, 1).otherwise(0)).alias('keyword_show_count')
    )
    df = df.withColumn('keyword_index_click_count',
                       concat_ws(":", col('keyword_index'), col('keyword_click_count')))
    df = df.withColumn('keyword_index_show_count',
                       concat_ws(":", col('keyword_index'), col('keyword_show_count')))
    df = df.withColumn('keyword_click_count',
                       concat_ws(":", col('keyword'), col('keyword_click_count')))
    df = df.withColumn('keyword_show_count',
                       concat_ws(":", col('keyword'), col('keyword_show_count')))

    # group 2: group by uckey + interval_starting_time
    df = df.groupBy('uckey', 'interval_starting_time').agg(
        concat_ws(",", collect_list('keyword_index')).alias('interval_keyword_indexes'),
        concat_ws(",", collect_list('keyword_index_click_count')).alias('interval_keyword_indexes_click_counts'),
        concat_ws(",", collect_list('keyword_index_show_count')).alias('interval_keyword_indexes_show_counts'),
        concat_ws(",", collect_list('keyword')).alias('interval_keywords'),
        concat_ws(",", collect_list('keyword_click_count')).alias('interval_keywords_click_counts'),
        concat_ws(",", collect_list('keyword_show_count')).alias('interval_keywords_show_counts')
    )
    return df
def pivotSummary(df: DataFrame) -> DataFrame:
    '''
    Combined with the melt function, this function takes the summary of a
    dataframe produced by the `.describe()` function and outputs it in a
    longer, more readable format, especially for dataframes with many
    variables.
    '''
    schema = df.schema
    slist = [field.name for field in schema]
    id1 = slist[0]
    slist.remove('summary')
    longFormat = melt(df, id_vars=[id1], value_vars=slist)
    wideDF = longFormat.groupBy('variable').pivot(
        'summary', ['count', 'mean', 'stddev', 'min', 'max']).agg(first('value'))
    return wideDF
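# Hypothetical usage, assuming the melt helper referenced in the docstring is in scope:
summary = df.describe()        # rows: count/mean/stddev/min/max, one column per variable
pivotSummary(summary).show()   # one row per variable, one column per statistic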
idf_model.write().overwrite().save(
    "s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/alfred/idf_model")
# df_standard = idf_model.transform(df_standard)
# df_standard.show()

# --------- A new job should be started from here; I skipped writing it ------------

# 5. Build the company labels from the existing cross-join data
df_result = load_training_data(spark)  # data to be cleaned

# Build a new label for the manufacturer
df_mnf_label = df_result.where(df_result.label == 1.0).select(
    "id", "MANUFACTURER_NAME", "MANUFACTURER_NAME_STANDARD", "MANUFACTURER_NAME_EN_STANDARD")
df_mnf_label = df_mnf_label.groupBy("id").agg(
    first(df_mnf_label.MANUFACTURER_NAME).alias("MANUFACTURER_NAME_ANSWER"),
    first(df_mnf_label.MANUFACTURER_NAME_STANDARD).alias("MANUFACTURER_NAME_STANDARD_ANSWER"),
    first(df_mnf_label.MANUFACTURER_NAME_EN_STANDARD).alias("MANUFACTURER_NAME_EN_STANDARD_ANSWER"))
df_result = df_result.join(df_mnf_label, how="left", on="id")
df_result = df_result.withColumn(
    "mnf_label",
    when((df_result.MANUFACTURER_NAME_STANDARD == df_result.MANUFACTURER_NAME_STANDARD_ANSWER)
         | (df_result.MANUFACTURER_NAME_EN_STANDARD == df_result.MANUFACTURER_NAME_EN_STANDARD_ANSWER),
         1.0).otherwise(0.0))
# df_result.select("id", "MANUFACTURER_NAME", "MANUFACTURER_NAME_STANDARD",
#                  "MANUFACTURER_NAME_STANDARD_ANSWER", "mnf_label").show()
df_result = df_result.drop("MANUFACTURER_NAME_ANSWER",
                           "MANUFACTURER_NAME_STANDARD_ANSWER",
                           "MANUFACTURER_NAME_EN_STANDARD_ANSWER")
df = spark.createDataFrame(data=simpleData, schema=schema)
df.printSchema()
df.show(truncate=False)

print("approx_count_distinct: " +
      str(df.select(approx_count_distinct("salary")).collect()[0][0]))
print("avg: " + str(df.select(avg("salary")).collect()[0][0]))

df.select(collect_list("salary")).show(truncate=False)
df.select(collect_set("salary")).show(truncate=False)

df2 = df.select(countDistinct("department", "salary"))
df2.show(truncate=False)
print("Distinct Count of Department & Salary: " + str(df2.collect()[0][0]))

print("count: " + str(df.select(count("salary")).collect()[0]))
df.select(first("salary")).show(truncate=False)
df.select(last("salary")).show(truncate=False)
df.select(kurtosis("salary")).show(truncate=False)
df.select(max("salary")).show(truncate=False)
df.select(min("salary")).show(truncate=False)
df.select(mean("salary")).show(truncate=False)
df.select(skewness("salary")).show(truncate=False)
df.select(stddev("salary"), stddev_samp("salary"),
          stddev_pop("salary")).show(truncate=False)
df.select(sum("salary")).show(truncate=False)
df.select(sumDistinct("salary")).show(truncate=False)
df.select(variance("salary"), var_samp("salary"), var_pop("salary")) \
    .show(truncate=False)
def agg_first(col):
    return first(col).alias(col)
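# Hypothetical usage of the helper above: keep the first observed value of each metadata
# column when rolling up to one row per client (dataframe and column names are illustrative).
rolled = df.groupBy('client_id').agg(*[agg_first(c) for c in ['country', 'locale', 'os']])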
def hash(self, df_trajectory_processed, df_type='pandas'):
    # Assert implemented methods
    assert df_type in {'pandas', 'spark'}, \
        'hash@<TrajectoryHasherJacardEstimation>: df_type = "{}" is not implemented!'.format(df_type)

    # Hash
    if df_type == 'pandas':
        # Bounds
        id_timestamp_min = df_trajectory_processed['id_timestamp'].min()
        id_timestamp_max = df_trajectory_processed['id_timestamp'].max()

        # Select ID timestamps for hashes
        id_timestamps_selected = np.random.choice(
            np.arange(id_timestamp_min, id_timestamp_max + 1),
            self.n_hashes,
            replace=False) if self.n_hashes < (id_timestamp_max - id_timestamp_min + 1) else list(
                range(id_timestamp_min, id_timestamp_max + 1))
        id_timestamps_selected_set = set(id_timestamps_selected)

        # Filter
        df_result = df_trajectory_processed[
            df_trajectory_processed['id_timestamp'].map(
                lambda x: x in id_timestamps_selected_set)].copy().sort_values(
                    ['id_user', 'id_timestamp'])

        # Index locations
        location_indices = []
        lat_lng_to_idx = dict()
        for lat, lng in zip(df_result['lat'], df_result['lng']):
            key = (lat, lng)
            if key not in lat_lng_to_idx:
                lat_lng_to_idx[key] = len(lat_lng_to_idx)  # add new index
            location_indices.append(lat_lng_to_idx[key])
        df_result['location_indices'] = location_indices

        # Calculate hashes
        df_hashes = None
        for i, id_timestamp in enumerate(id_timestamps_selected):
            if df_hashes is None:
                df_hashes = df_result[df_result['id_timestamp'] == id_timestamp][
                    ['id_user', 'location_indices']].copy()
            else:
                df_hashes['location_indices'] = df_result[
                    df_result['id_timestamp'] == id_timestamp]['location_indices'].values
            # Rename the new column
            colname = 'hash_{}'.format(i)
            df_hashes.rename(columns={'location_indices': colname}, inplace=True)
    elif df_type == 'spark':
        # ID timestamp bounds
        row = df_trajectory_processed.agg(
            sql_functions.min(sql_functions.col("id_timestamp")).alias("id_timestamp_min"),
            sql_functions.max(sql_functions.col("id_timestamp")).alias("id_timestamp_max")).head()
        id_timestamp_min, id_timestamp_max = row['id_timestamp_min'], row['id_timestamp_max']

        # Chosen timestamps
        id_timestamps_selected = np.random.choice(
            np.arange(id_timestamp_min, id_timestamp_max + 1),
            self.n_hashes,
            replace=False).tolist() if self.n_hashes < (id_timestamp_max - id_timestamp_min + 1) else list(
                range(id_timestamp_min, id_timestamp_max + 1))

        # Create a Spark DataFrame of the chosen timestamps
        df_id_timestamps = self.params['spark'].createDataFrame(
            [[id_timestamp, 'hash_{}'.format(i)]
             for i, id_timestamp in enumerate(id_timestamps_selected)],
            schema=sql_types.StructType([
                sql_types.StructField('id_timestamp', sql_types.IntegerType(), False),
                sql_types.StructField('hash_name', sql_types.StringType(), False),
            ]))

        # ID locations
        df_location_ids = df_trajectory_processed.select("lat", "lng") \
            .distinct().withColumn(
                "id_location",
                sql_functions.row_number().over(Window.orderBy("lat", "lng")))

        # Join hashes
        df_result = df_trajectory_processed \
            .join(df_id_timestamps, on=['id_timestamp'], how='inner') \
            .join(df_location_ids, on=['lat', 'lng'], how='inner')

        # Turn into a table
        df_hashes = df_result.groupby("id_user") \
            .pivot("hash_name") \
            .agg(sql_functions.first("id_location"))

    # Return
    return df_hashes
key = my_bucket_object.key
tablename = key.split("_")
table = tablename[0]
val = tablename[2].split(".")[0]
path = f's3a://{bucket}/{key}'
df = spark.read.option("header", True).csv(path)

s = [s for s in df.columns if '2019' in s]
selected = ['State', 'Metro'] + s
df2 = df.select(*selected)
newdf = df2.withColumn('house_average', sum(df2[col] for col in s) / len(s))
df1 = newdf.withColumn("house_average", F.round(newdf["house_average"], 1))
d = df1.drop(*s)
d = d.withColumn('bedrooms', F.lit(val))
d = d.withColumnRenamed('Metro', 'city')
d = d.dropna(subset=["city"])
d = d.groupBy('state').agg(F.avg('house_average').alias('house_average'),
                           F.first('bedrooms'))
# d = d.withColumnRenamed('first(state)', 'state')
d = d.withColumnRenamed('first(bedrooms)', 'bedrooms')
d = d.withColumn("house_average", F.round(d["house_average"], 1))
d = d.sort('state')
d.show()

d.write \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://10.0.0.8:5432/my_db") \
    .option("dbtable", "house_prices") \
    .option("user", "test") \
    .option("password", "test") \
    .option("driver", "org.postgresql.Driver") \
    .mode("Append") \
    .save()
sc = SparkContext(appName='generateDOIBoost')
spark = SparkSession(sc)

# Load the CrossRef dataframe
crossref = spark.read.load('/data/df/crossref.parquet', format="parquet")

# Load the MAG dataframe
microsoft = spark.read.load("/data/df/mag.parquet", format="parquet")
# Alias each column with _mag
microsoft = microsoft.select(*(col(x).alias(x + '_mag') for x in microsoft.columns))
# Group by DOI since we have repeated DOIs with multiple abstracts; for the moment we take the first one
mag = microsoft.groupBy('doi_mag').agg(
    first('authors_mag').alias('author_mag'),
    first('abstract_mag').alias('abstract_mag'),
    first('collectedFom_mag').alias('collectedFrom_mag'))

# Load the ORCID dataframe
orcid = spark.read.load("/data/df/ORCID.parquet", format="parquet")
# Fix the missing value in collectedFrom
orcid = orcid.withColumn('collectedFrom', array(lit('ORCID')))
# Alias each column with _orcid
orcid = orcid.select(*(col(x).alias(x + '_orcid') for x in orcid.columns))

# Load the UnpayWall dataframe
uw = spark.read.load("/data/df/unpaywall.parquet", format="parquet")