Example #1
def minute(self) -> ks.Series:
    """
    The minutes of the datetime.
    """
    return _wrap_accessor_spark(self, F.minute,
                                LongType()).alias(self.name)
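For reference, a minimal plain-PySpark sketch of what the accessor above wraps; the DataFrame, the column name `ts`, and the sample value are illustrative assumptions, not part of the original code:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("2021-01-01 12:34:56",)], ["ts"]) \
    .select(F.to_timestamp("ts").alias("ts"))
df.select(F.minute("ts").alias("minute")).show()  # -> 34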
Example #2
    windowSpec = Window.partitionBy("DISTRICT") \
        .orderBy(col("Lat").desc())

    crimeFacts.select(
        "DISTRICT", "Lat",
        func.max("Lat").over(windowSpec).alias("max_lat")).show()

    print(crimeFacts.rdd.take(5))

    from pyspark.sql.types import LongType
    from pyspark.sql.functions import udf

    def squared_typed(s):
        return s * s

    squared_udf = udf(squared_typed, LongType())

    crimeFacts.select("hour", squared_udf("hour")).show()
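    # Aside (not in the original notebook): the same squared column can be computed
    # with built-in column arithmetic instead of a Python UDF, which avoids the
    # Python serialization overhead.
    crimeFacts.select("hour", (col("hour") * col("hour")).alias("hour_squared")).show()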

    offenseCodes = spark.read\
        .option("header", "true")\
        .option("inferSchema", "true")\
        .csv("boston_crimes/offense_codes.csv")
    offenseCodes.show()

    robberyStats = crimeFacts\
        .join(offenseCodes, offenseCodes.CODE == crimeFacts.OFFENSE_CODE)\
        .filter(offenseCodes.NAME.contains("ROBBERY"))\
        .groupBy(offenseCodes.NAME)\
        .count()\
        .orderBy(col("count").desc())
Example #3
    def attach_id_column(self, id_type: str, column: Union[Any, Tuple]) -> "DataFrame":
        """
        Attach a column to be used as identifier of rows similar to the default index.

        See also `Default Index type
        <https://koalas.readthedocs.io/en/latest/user_guide/options.html#default-index-type>`_.

        Parameters
        ----------
        id_type : string
            The id type.

            - 'sequence' : a sequence that increases one by one.

              .. note:: this uses Spark's Window without specifying a partition specification.
                  This moves all data into a single partition on a single machine and
                  could cause serious performance degradation.
                  Avoid this method with very large datasets.

            - 'distributed-sequence' : a sequence that increases one by one,
              by group-by and group-map approach in a distributed manner.
            - 'distributed' : a monotonically increasing sequence simply by using PySpark’s
              monotonically_increasing_id function in a fully distributed manner.

        column : string or tuple of string
            The column name.

        Returns
        -------
        DataFrame
            The DataFrame with the column attached.

        Examples
        --------
        >>> df = ps.DataFrame({"x": ['a', 'b', 'c']})
        >>> df.pandas_on_spark.attach_id_column(id_type="sequence", column="id")
           x  id
        0  a   0
        1  b   1
        2  c   2

        >>> df.pandas_on_spark.attach_id_column(id_type="distributed-sequence", column=0)
           x  0
        0  a  0
        1  b  1
        2  c  2

        >>> df.pandas_on_spark.attach_id_column(id_type="distributed", column=0.0)
        ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
           x  0.0
        0  a  ...
        1  b  ...
        2  c  ...

        For multi-index columns:

        >>> df = ps.DataFrame({("x", "y"): ['a', 'b', 'c']})
        >>> df.pandas_on_spark.attach_id_column(id_type="sequence", column=("id-x", "id-y"))
           x id-x
           y id-y
        0  a    0
        1  b    1
        2  c    2

        >>> df.pandas_on_spark.attach_id_column(id_type="distributed-sequence", column=(0, 1.0))
           x   0
           y 1.0
        0  a   0
        1  b   1
        2  c   2
        """
        from pyspark.pandas.frame import DataFrame

        if id_type == "sequence":
            attach_func = InternalFrame.attach_sequence_column
        elif id_type == "distributed-sequence":
            attach_func = InternalFrame.attach_distributed_sequence_column
        elif id_type == "distributed":
            attach_func = InternalFrame.attach_distributed_column
        else:
            raise ValueError(
                "id_type should be one of 'sequence', 'distributed-sequence' and 'distributed'"
            )

        assert is_name_like_value(column, allow_none=False), column
        if not is_name_like_tuple(column):
            column = (column,)

        internal = self._psdf._internal

        if len(column) != internal.column_labels_level:
            raise ValueError(
                "The given column `{}` must be the same length as the existing columns.".format(
                    column
                )
            )
        elif column in internal.column_labels:
            raise ValueError(
                "The given column `{}` already exists.".format(name_like_string(column))
            )

        # Make sure the underlying Spark column names are the form of
        # `name_like_string(column_label)`.
        sdf = internal.spark_frame.select(
            [
                scol.alias(SPARK_INDEX_NAME_FORMAT(i))
                for i, scol in enumerate(internal.index_spark_columns)
            ]
            + [
                scol.alias(name_like_string(label))
                for scol, label in zip(internal.data_spark_columns, internal.column_labels)
            ]
        )
        sdf, force_nullable = attach_func(sdf, name_like_string(column))

        return DataFrame(
            InternalFrame(
                spark_frame=sdf,
                index_spark_columns=[
                    scol_for(sdf, SPARK_INDEX_NAME_FORMAT(i)) for i in range(internal.index_level)
                ],
                index_names=internal.index_names,
                index_fields=(
                    [field.copy(nullable=True) for field in internal.index_fields]
                    if force_nullable
                    else internal.index_fields
                ),
                column_labels=internal.column_labels + [column],
                data_spark_columns=(
                    [scol_for(sdf, name_like_string(label)) for label in internal.column_labels]
                    + [scol_for(sdf, name_like_string(column))]
                ),
                data_fields=(
                    (
                        [field.copy(nullable=True) for field in internal.data_fields]
                        if force_nullable
                        else internal.data_fields
                    )
                    + [
                        InternalField.from_struct_field(
                            StructField(name_like_string(column), LongType(), nullable=False)
                        )
                    ]
                ),
                column_label_names=internal.column_label_names,
            ).resolved_copy
        )
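The docstring note above about 'sequence' relying on a Window without a partition specification can be illustrated in plain PySpark. This is only a sketch; the DataFrame and column names are assumptions:

from pyspark.sql import SparkSession, Window, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a",), ("b",), ("c",)], ["x"])

# 'sequence'-style id: a global Window with no partitionBy pulls every row into a
# single partition on one executor, which is what the note warns about.
w = Window.orderBy(F.monotonically_increasing_id())
df.withColumn("id", F.row_number().over(w) - 1).show()

# 'distributed'-style id: increasing but not consecutive, computed fully in parallel.
df.withColumn("id", F.monotonically_increasing_id()).show()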
def process_log_data(spark, input_data, output_data):
    """ Process log data (users, time and songplays tables) from the JSON files on S3;
        after normalization and transformation the data are written out as parquet files """

    # Provide the JSON structure (schema) to Spark
    logdata_schema = StructType([
        StructField("artist", StringType(), True),
        StructField("auth", StringType(), True),
        StructField("firstName", StringType(), True),
        StructField("gender", StringType(), True),
        StructField("itemInSession", LongType(), True),
        StructField("lastName", StringType(), True),
        StructField("length", DoubleType(), True),
        StructField("level", StringType(), True),
        StructField("location", StringType(), True),
        StructField("method", StringType(), True),
        StructField("page", StringType(), True),
        StructField("registration", DoubleType(), True),
        StructField("sessionId", LongType(), True),
        StructField("song", StringType(), True),
        StructField("status", LongType(), True),
        StructField("ts", LongType(), True),
        StructField("userAgent", StringType(), True),
        StructField("userId", StringType(), True),
    ])
        
    # get filepath to log data file
    log_data = input_data + 'log-data'

    # read log data file, JSON structure
    df = spark.read.json(log_data, schema = logdata_schema)
    
    # filter by actions for song plays
    df = df.filter(col("page") == 'NextSong')
    
    # extract columns for users table
    users_table = df.select(col("userId").alias("user_id"),col("firstName").alias("first_name"),
                            col("lastName").alias("last_name"),"gender","level")
    
    # write users table to parquet files
    users_table.write.parquet(output_data+"users")

    tsFormat = "yyyy-MM-dd HH:mm:ss z"
    # Converting ts to a timestamp format    
    time_table = df.withColumn('ts',
                               to_timestamp(date_format((df.ts 
                                                         /1000).cast(dataType=TimestampType()), tsFormat), tsFormat))

    # extract columns to create time table    
    time_table = time_table.select(col("ts").alias("start_time"),
                                   hour(col("ts")).alias("hour"),
                                   dayofmonth(col("ts")).alias("day"), 
                                   weekofyear(col("ts")).alias("week"), 
                                   month(col("ts")).alias("month"),
                                   year(col("ts")).alias("year"))

    
    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year","month").parquet(output_data+"time")

    # read in song data to use for songplays table
    song_data = input_data+"song-data/*/*/*/*.json"
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = song_df.join(df, song_df.artist_name == df.artist) \
        .withColumn("songplay_id", monotonically_increasing_id()) \
        .withColumn("start_time",
                    to_timestamp(date_format((col("ts") / 1000).cast(dataType=TimestampType()),
                                             tsFormat), tsFormat)) \
        .select("songplay_id",
                "start_time",
                col("userId").alias("user_id"),
                "level",
                "song_id",
                "artist_id",
                col("sessionId").alias("session_id"),
                col("artist_location").alias("location"),
                "userAgent",
                month(col("start_time")).alias("month"),
                year(col("start_time")).alias("year"))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year","month").parquet(output_data+"songplays")
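As a hedged aside, not what the script above does: epoch milliseconds such as `ts` can also be converted to a timestamp with a direct cast, skipping the date_format/to_timestamp round trip:

from pyspark.sql import functions as F

df = df.withColumn("start_time", (F.col("ts") / 1000).cast("timestamp"))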
# Item-based Collaborative Filtering --- GraphFrames, DataFrame
sc.addPyFile("/Users/soober/Downloads/graphframes-0.7.0-spark2.4-s_2.11.jar")

from graphframes import *
from pyspark.sql.types import StructType, StructField, LongType, IntegerType, StringType

UserAnimeSchema = StructType([
    StructField("username", StringType(), True),
    StructField("animeId", IntegerType(), True),
    StructField("watched_episodes", LongType(), True),
    StructField("start_date", StringType(), True),
    StructField("finish_date", StringType(), True),
    StructField("rating", IntegerType(), True),
    StructField("status", IntegerType(), True),
    StructField("rewatching", StringType(), True),
    StructField("rewatching_ep", LongType(), True),
    StructField("last_updated_date", StringType(), True),
    StructField("tags", StringType(), True),
])

user_anime = spark.read.schema(UserAnimeSchema) \
    .option("header", "false") \
    .option("mode", "DROPMALFORMED") \
    .csv(RATIING_DATASET)

raw_edges = user_anime.select("username", "animeId", "rating") \
    .na.drop(subset=["username", "animeId"]) \
    .filter("rating <= 10 and rating >= 1")

userlist = spark.read.csv(USER_DATASET, header="true", inferSchema="true") \
    .select("username", "user_id")
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import expr, col, lit
from pyspark.sql.types import StructType, StructField, StringType, LongType
import pyspark.sql.functions as fspark
import collections

spark = SparkSession.builder \
                    .master("local")\
                    .appName('abc') \
                    .config("spark.some.config.option", "some-value")\
                    .enableHiveSupport()\
                    .getOrCreate()
spark.conf
spark.range(3).show()

myManualSchema = StructType([
  StructField("some", StringType(), True),
  StructField("col", StringType(), True),
  StructField("names", LongType(), False)
])
myRow = Row("Hello", "hi", 1)
myDf = spark.createDataFrame([myRow], myManualSchema)
myDf.show()
myDf.select("col")
myDf.select(expr("col as c"),col("col").alias("c1"),lit(1)).show()
myDf.select(expr("col as c"),col("col").alias("c1"),lit(1)).distinct().count()
myDf.selectExpr("*","col as newColumnName", "col as wer" ).show(2)
myDf.selectExpr("col as `This Long Column-Name`" ).show(2)
myDf.withColumn("numberOne", lit(1) * expr("names")).show(2)
myDf.withColumn("This Long Column-Name",expr("col")).show()
myDf.withColumnRenamed("col", "dest").columns
myDf.drop("col").columns
myDf.filter(col("names") == 1).show()
myDf.filter("names == 1").show()
Example #7
def generate_search_count(engine='google', source='urlbar', count=4):
    return {
        'engine': engine,
        'source': source,
        'count': count,
    }


addons_type = ArrayType(
    StructType([
        StructField('addon_id', StringType(), False),
        StructField('blocklisted', BooleanType(), True),
        StructField('name', StringType(), True),
        StructField('user_disabled', BooleanType(), True),
        StructField('app_disabled', BooleanType(), True),
        StructField('version', StringType(), True),
        StructField('scope', LongType(), True),
        StructField('type', StringType(), True),
        StructField('foreign_install', BooleanType(), True),
        StructField('has_binary_components', BooleanType(), True),
        StructField('install_day', LongType(), True),
        StructField('update_day', LongType(), True),
        StructField('signed_state', LongType(), True),
        StructField('is_system', BooleanType(), True),
        StructField('is_web_extension', BooleanType(), True),
        StructField('multiprocess_compatible', BooleanType(), True),
    ]))


def generate_addon(addon_id, name, version):
    return {
        'addon_id': addon_id,
        'name': name,
        'version': version,
    }
Example #8
    # signature.append(repoName)
    # signature.append(repoPath)
    # For each of the random hash functions...
    for i in range(0, numHashes):
        minHashCode = nextPrime + 1
        for shingleID in shinglesInDoc:
            hashCode = (valueA[i] * shingleID + valueB[i]) % nextPrime
            if hashCode < minHashCode:
                minHashCode = hashCode
        signature.append(minHashCode)
    # signatures.append(signature)
    elapsed = (time.time() - t0)
    return signature
    # print(signatures)

func_udf = udf(generate_shingel_minhash, ArrayType(LongType()))
df = df.withColumn('ten_signatures', func_udf("content"))
#df.select('ten_signatures').show()
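# Aside (not part of the original script): two equal-length MinHash signatures can be
# compared directly -- the fraction of positions where they agree estimates the
# Jaccard similarity of the underlying shingle sets.
def estimated_jaccard(sig_a, sig_b):
    # Fraction of hash slots on which the two signatures agree.
    assert len(sig_a) == len(sig_b)
    return sum(a == b for a, b in zip(sig_a, sig_b)) / float(len(sig_a))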



df.printSchema()
def insert_db(df2):
    config = configparser.ConfigParser()
    config.read('config.ini')
    url = "jdbc:mysql://localhost/insight"
    properties = {
        "user": config['mysqlDB']['user'],
        "password": config['mysqlDB']['pass'],
        "driver": config['mysqlDB']['driver']
    }
Example #9
    df_coll_need = df_coll.withColumn('X', zip_(df_coll.food, df_coll.price))\
                    .drop('food').drop('price')
    df_coll_need.show(truncate=False)

    # ---------------------------
    # 3 - Single-column aggregation: agg() + dict
    # ---------------------------
    df.groupBy("name").agg({'price': 'mean'}).show()
    df.groupBy("name").agg({'price': 'max'}).show()

    # ---------------------------
    # 4- udf
    # ---------------------------
    data = [('alex', 5), ('jane', 7), ('bob', 9)]
    df_udf = spark.createDataFrame(data, ['name', 'age'])
    sqrt_udf = udf(m_sqrt, LongType())
    df_udfed = df_udf.select('name', 'age', sqrt_udf('age').alias('age_sqrt'))
    df_udfed.show()

    # ---------------------------
    # 5- df from pandas
    # ---------------------------
    pd_df = pd.DataFrame(
        data={
            'integers': [2, 5, 7, 8, 9],
            'floats': [1.2, -2.0, 1.5, 2.7, 3.6],
            'int_arrays': [[6], [1, 2], [3, 4, 5], [6, 7, 8, 9], [10, 11, 12]]
        })
    spark_df = spark.createDataFrame(pd_df)
    spark_df.show()
    # Convert back to pandas
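    # (Illustrative guess at the step this comment refers to; toPandas() collects the
    # Spark DataFrame back to the driver as a pandas DataFrame.)
    pd_back = spark_df.toPandas()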
import findspark

findspark.init()

import json
import os
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, LongType, IntegerType, FloatType, BooleanType, \
    DoubleType, ShortType

type_map = {
    'int': IntegerType(),
    'bigint': LongType(),
    'smallint': ShortType(),
    'float': FloatType(),
    'double': DoubleType(),
    'string': StringType(),
    'boolean': BooleanType(),
}
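# Hypothetical usage of type_map (the column list below is an assumption, not part of
# this job): build a Spark schema from (column_name, hive_type_name) pairs such as
# might be listed in the file at schema_path.
_example_columns = [("user_id", "bigint"), ("price", "double"), ("city", "string")]
_example_schema = StructType(
    [StructField(name, type_map[t], True) for name, t in _example_columns])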


class LoadProdDataJob:
    def __init__(self, database, target_table, file_path, partition_columns,
                 dfs_path, schema_path):
        self.database = database
        self.target_table = target_table
        self.file_path = file_path
        self.partition_columns = partition_columns
        self.dfs_path = dfs_path
        self.schema_path = schema_path
Example #11
def compute_churn_week(df, week_start):
    """Compute the churn data for this week. Note that it takes 10 days
    from the end of this period for all the activity to arrive. This data
    should be from Sunday through Saturday.

    df: DataFrame of the dataset relevant to computing the churn
    week_start: datestring of this time period"""

    week_start_date = datetime.strptime(week_start, "%Y%m%d")
    week_end_date = week_start_date + timedelta(6)
    week_start = fmt(week_start_date)
    week_end = fmt(week_end_date)

    # Verify that the start date is a Sunday
    if week_start_date.weekday() != 6:
        msg = "Week start date {} is not a Sunday".format(week_start)
        raise RuntimeError(msg)

    # If the data for this week can still be coming, don't try to compute the
    # churn.
    week_end_slop = fmt(week_end_date + timedelta(10))
    today = fmt(datetime.utcnow())
    if week_end_slop >= today:
        msg = ("Skipping week of {} to {} - Data is still arriving until {}.".
               format(week_start, week_end, week_end_slop))
        raise RuntimeError(msg)

    logger.info("Starting week from {} to {}".format(week_start, week_end))

    # the subsession_start_date field has a different form than
    # submission_date_s3, so needs to be formatted with hyphens.
    week_end_excl = fmt(week_end_date + timedelta(1), date_format="%Y-%m-%d")
    week_start_hyphenated = fmt(week_start_date, date_format="%Y-%m-%d")

    current_week = (df.filter(df['submission_date_s3'] >= week_start).filter(
        df['submission_date_s3'] <= week_end_slop).filter(
            df['subsession_start_date'] >= week_start_hyphenated).filter(
                df['subsession_start_date'] < week_end_excl))

    # take a subset and rename the app_version field
    current_week = (current_week.select(source_columns).withColumnRenamed(
        "scalar_parent_browser_engagement_total_uri_count",
        "total_uri_count").withColumnRenamed(
            "scalar_parent_browser_engagement_unique_domains_count",
            "unique_domains_count").withColumnRenamed("app_version",
                                                      "version"))

    # clean some of the aggregate fields
    current_week = current_week.na.fill(
        0, ["total_uri_count", "unique_domains_count"])

    # Clamp broken subsession values in the [0, MAX_SUBSESSION_LENGTH] range.
    clamped_subsession_subquery = (F.when(
        F.col('subsession_length') > MAX_SUBSESSION_LENGTH,
        MAX_SUBSESSION_LENGTH).otherwise(
            F.when(F.col('subsession_length') < 0,
                   0).otherwise(F.col('subsession_length'))))

    # Compute per client aggregates lost during newest client computation
    per_client_aggregates = (current_week.select(
        'client_id', 'total_uri_count', 'unique_domains_count',
        clamped_subsession_subquery.alias('subsession_length')).groupby(
            'client_id').agg(
                F.sum('subsession_length').alias('usage_seconds'),
                F.sum('total_uri_count').alias('total_uri_count_per_client'),
                F.avg('unique_domains_count').alias(
                    'average_unique_domains_count_per_client')))

    # Get the newest ping per client and append to original dataframe
    newest_per_client = get_newest_per_client(current_week)
    newest_with_usage = newest_per_client.join(per_client_aggregates,
                                               'client_id', 'inner')

    # Build the "effective version" cache:
    d2v = make_d2v(get_release_info())

    converted = newest_with_usage.rdd.map(
        lambda x: convert(d2v, week_start, x))
    """
    - channel (appUpdateChannel)
    - geo (bucketed into top 30 countries + "rest of world")
    - is_funnelcake (contains "-cck-"?)
    - acquisition_period (cohort_week)
    - start_version (effective version on profile creation date)
    - sync_usage ("no", "single" or "multiple" devices)
    - current_version (current appVersion)
    - current_week (week)
    - source (associated attribution)
    - medium (associated with attribution)
    - campaign (associated with attribution)
    - content (associated with attribution)
    - distribution_id (funnelcake associated with profile)
    - default_search_engine
    - locale
    - is_active (were the client_ids active this week or not)
    - n_profiles (count of matching client_ids)
    - usage_hours (sum of the per-client subsession lengths,
            clamped in the [0, MAX_SUBSESSION_LENGTH] range)
    - sum_squared_usage_hours (the sum of squares of the usage hours)
    - total_uri_count (sum of per-client uri counts)
    - unique_domains_count_per_profile (average of the average unique
             domains per-client)
    """
    churn_schema = StructType([
        StructField('channel', StringType(), True),
        StructField('geo', StringType(), True),
        StructField('is_funnelcake', StringType(), True),
        StructField('acquisition_period', StringType(), True),
        StructField('start_version', StringType(), True),
        StructField('sync_usage', StringType(), True),
        StructField('current_version', StringType(), True),
        StructField('current_week', LongType(), True),
        StructField('source', StringType(), True),
        StructField('medium', StringType(), True),
        StructField('campaign', StringType(), True),
        StructField('content', StringType(), True),
        StructField('distribution_id', StringType(), True),
        StructField('default_search_engine', StringType(), True),
        StructField('locale', StringType(), True),
        StructField('is_active', StringType(), True),
        StructField('n_profiles', LongType(), True),
        StructField('usage_hours', DoubleType(), True),
        StructField('sum_squared_usage_hours', DoubleType(), True),
        StructField('total_uri_count', LongType(), True),
        StructField('unique_domains_count', DoubleType(), True)
    ])

    # Don't bother to filter out non-good records - they will appear
    # as 'unknown' in the output.
    countable = converted.map(lambda x: (
        (
            # attributes unique to a client
            x.get('channel', 'unknown'),
            x.get('geo', 'unknown'),
            "yes" if x.get('is_funnelcake', False) else "no",
            datetime.strftime(x.get('acquisition_period', date(2000, 1, 1)),
                              "%Y-%m-%d"),
            x.get('start_version', 'unknown'),
            x.get('sync_usage', 'unknown'),
            x.get('current_version', 'unknown'),
            x.get('current_week', -1),
            x.get('source', 'unknown'),
            x.get('medium', 'unknown'),
            x.get('campaign', 'unknown'),
            x.get('content', 'unknown'),
            x.get('distribution_id', 'unknown'),
            x.get('default_search_engine', 'unknown'),
            x.get('locale', 'unknown'),
            x.get('is_active', 'unknown')),
        (
            1,  # active users
            x.get('usage_hours', 0.0),
            x.get('squared_usage_hours', 0.0),
            x.get('total_uri_count', 0),
            x.get('unique_domains_count', 0.0))))

    def reduce_func(x, y):
        return tuple(map(sum, zip(x, y)))

    aggregated = countable.reduceByKey(reduce_func)

    records_df = aggregated.map(lambda x: x[0] + x[1]).toDF(churn_schema)

    # Apply some post-processing for other aggregates
    # (i.e. unique_domains_count). This needs to be done when you want
    # something other than just a simple sum
    def average(total, n):
        if not n:
            return 0.0
        return float(total) / n

    average_udf = F.udf(average, DoubleType())

    # Create new derived columns and drop any unnecessary ones
    records_df = (
        records_df
        # The total number of unique domains divided by the number of profiles
        # over a set of dimensions. This should be aggregated using a weighted
        # mean, i.e. sum(unique_domains_count_per_profile * n_profiles)
        .withColumn('unique_domains_count_per_profile',
                    average_udf(F.col('unique_domains_count'),
                                F.col('n_profiles')))
        # This value is meaningless because of overlapping domains between
        # profiles
        .drop('unique_domains_count')
    )

    return records_df
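For what it's worth, the weighted mean that the comment above asks for could later be computed from the returned records along these lines (a sketch, assuming the column names produced by this function and `pyspark.sql.functions` imported as `F`):

weighted_mean = records_df.agg(
    (F.sum(F.col("unique_domains_count_per_profile") * F.col("n_profiles"))
     / F.sum(F.col("n_profiles"))).alias("weighted_unique_domains_count_per_profile"))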
Example #12
def expected_search_clients_daily_data(define_dataframe_factory):
    # template for the expected results
    factory = define_dataframe_factory(
        list(
            map(
                to_field,
                [
                    ("client_id", "a", StringType(), False),
                    ("sample_id", "42", StringType(), False),
                    ("submission_date", "20170101", StringType(), False),
                    ("os", "windows", StringType(), True),
                    ("channel", "release", StringType(), True),
                    ("country", "DE", StringType(), True),
                    ("locale", "de", StringType(), True),
                    ("search_cohort", None, StringType(), True),
                    ("app_version", "54.0.1", StringType(), True),
                    ("distribution_id", None, StringType(), True),
                    ("addon_version", "0.9.5", StringType(), False),
                    ("engine", "google", StringType(), True),
                    ("source", "urlbar", StringType(), True),
                    ("tagged-sap", None, LongType(), True),
                    ("tagged-follow-on", None, LongType(), True),
                    ("tagged_sap", None, LongType(), True),
                    ("tagged_follow_on", None, LongType(), True),
                    ("sap", 4, LongType(), True),
                    ("organic", None, LongType(), True),
                    ("unknown", None, LongType(), True),
                    # Roughly 2016-01-01
                    ("profile_creation_date", 16801, LongType(), False),
                    ("default_search_engine", "google", StringType(), False),
                    (
                        "default_search_engine_data_load_path",
                        "jar:[app]/omni.ja!browser/google.xml",
                        StringType(),
                        False,
                    ),
                    (
                        "default_search_engine_data_submission_url",
                        "https://www.google.com/search?q=&ie=utf-8&oe=utf-8&client=firefox-b",
                        StringType(),
                        False,
                    ),
                    ("sessions_started_on_this_day", 1, LongType(), True),
                    ("profile_age_in_days", 366, LongType(), True),
                    ("subsession_hours_sum", 1.0, DoubleType(), True),
                    ("active_addons_count_mean", 2.0, DoubleType(), True),
                    ("max_concurrent_tab_count_max", 10, LongType(), True),
                    ("tab_open_event_count_sum", 5, LongType(), True),
                    ("active_hours_sum", 0.5, DoubleType(), True),
                ],
            )))

    return factory([
        {
            "client_id": "b",
            "country": "US"
        },
        # Covers 5 dupe rows and custom app_version, distribution_id rows
        {
            "app_version": "52.0.3",
            "sap": 28,
            "sessions_started_on_this_day": 7,
            "subsession_hours_sum": 7.0,
            "tab_open_event_count_sum": 35,
            "active_hours_sum": 3.5,
        },
        {
            "engine": "bing"
        },
        {
            "engine": "yahoo"
        },
        {
            "client_id": "c",
            "unknown": None,
            "sap": 0,
            "tagged-sap": None,
            "tagged-follow-on": None,
            "tagged_sap": None,
            "tagged_follow_on": None,
            "source": None,
            "engine": None,
        },
    ])
Example #13

# Boilerplate for generating example main_summary tables
def generate_search_count(engine="google", source="urlbar", count=4):
    return {"engine": engine, "source": source, "count": count}


addons_type = ArrayType(
    StructType([
        StructField("addon_id", StringType(), False),
        StructField("blocklisted", BooleanType(), True),
        StructField("name", StringType(), True),
        StructField("user_disabled", BooleanType(), True),
        StructField("app_disabled", BooleanType(), True),
        StructField("version", StringType(), True),
        StructField("scope", LongType(), True),
        StructField("type", StringType(), True),
        StructField("foreign_install", BooleanType(), True),
        StructField("has_binary_components", BooleanType(), True),
        StructField("install_day", LongType(), True),
        StructField("update_day", LongType(), True),
        StructField("signed_state", LongType(), True),
        StructField("is_system", BooleanType(), True),
        StructField("is_web_extension", BooleanType(), True),
        StructField("multiprocess_compatible", BooleanType(), True),
    ]))


def generate_addon(addon_id, name, version):
    return {"addon_id": addon_id, "name": name, "version": version}
Example #14
def second(self) -> ks.Series:
    """
    The seconds of the datetime.
    """
    return _wrap_accessor_spark(self, F.second,
                                LongType()).alias(self.name)
Example #15
# 2. Create a DataFrame
# 2.1. Create from a variable
stringCSVRDD = spark.sparkContext.parallelize([(123, "Katie", 19, "brown"),
                                               (234, "Michael", 22, "green"),
                                               (345, "Simone", 23, "blue")])

# Specify the schema: StructField(name, dataType, nullable)
# where:
# name: the name of the field,
# dataType: the data type of the field,
# nullable: whether the field's value may be null
from pyspark.sql.types import StructType, StructField, LongType, StringType  # import the types

schema = StructType([
    StructField("id", LongType(), True),
    StructField("name", StringType(), True),
    StructField("age", LongType(), True),
    StructField("eyeColor", StringType(), True)
])

# Apply the schema to the RDD and create a DataFrame
swimmers = spark.createDataFrame(stringCSVRDD, schema)
swimmers.registerTempTable("swimmers")
# Check the number of rows in the DataFrame
print(swimmers.count())

# 2.2. Create from a variable
# Create a DataFrame using automatic type inference
data = [(123, "Katie", 19, "brown"), (234, "Michael", 22, "green"),
        (345, "Simone", 23, "blue")]
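The snippet stops after building `data`; presumably it continues by letting Spark infer the types, along the lines of (column names taken from the explicit schema above):

swimmers2 = spark.createDataFrame(data, ["id", "name", "age", "eyeColor"])
swimmers2.printSchema()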
Example #16
def with_idx(sdf):
    new_schema = StructType(sdf.schema.fields + [StructField("idx", LongType(), False), ])
    return sdf.rdd.zipWithIndex().map(lambda row: row[0] + (row[1],)).toDF(
        schema=new_schema)
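Illustrative usage of `with_idx` (the input DataFrame is an assumption, and a SparkSession named `spark` is taken as given):

sdf = spark.createDataFrame([("a",), ("b",), ("c",)], ["value"])
with_idx(sdf).show()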
Example #17
def analyze(spark):
    # allSubDir = glob.glob("data/preprocess_test")
    # allcsv = []
    # for subdir in allSubDir:
    #     files = glob.glob(subdir + "*.csv")
    #     allcsv = allcsv + files
    input_file = "data/df.csv"
    mapping_file = "data/preprocess_test/mapping/mapping.csv"
    mapping_df = spark.read \
        .option("header", "true") \
        .option("treatEmptyValuesAsNulls", "true") \
        .option("inferSchema", "true") \
        .option("charset", "UTF-8") \
        .csv(mapping_file)

    fix_mapping_df = mapping_df\
        .withColumn("F_MAMH", when(col("F_MAMH").cast(IntegerType()).isNotNull(), col("F_MAMH") + lit(".0")).otherwise(col("F_MAMH")))\
        .withColumn("F_MAMH_new", when(col("F_MAMH_new").cast(IntegerType()).isNotNull(), col("F_MAMH_new").cast(IntegerType()) + lit(".0")).otherwise(col("F_MAMH_new")))

    # df = spark.read.format("com.crealytics.spark.excel").option("location", input_file) \
    #     .option("useHeader", "True") \
    #     .option("treatEmptyValuesAsNulls", "true") \
    #     .option("inferSchema", "False") \
    #     .option("addColorColumns", "False") \
    #     .load()  # original input file
    df = spark.read \
        .option("header", "true") \
        .option("treatEmptyValuesAsNulls", "true") \
        .option("inferSchema", "true") \
        .option("charset", "UTF-8") \
        .csv(input_file)

    df = df.select("MASV1", "F_MAMH", "F_MAKH", "TKET", "F_NIENKHOA")
    # df = df.groupBy("MASV1", "F_MAMH", "F_MAKH").agg(collect_list("TKET").alias("list_TKET")).withColumn("TKET", col("list_TKET")[0])
    # df = df.filter(df["F_MAKH"] == "MT")
    # print(df.count())
    # df = df.withColumn("MASV1", df["MASV1"].cast(DoubleType()))
    # df = df.withColumn("MASV1", df["MASV1"].cast(IntegerType()))
    # df = df.withColumn("TKET", df["TKET"].cast(DoubleType()))
    # df = df.groupBy("MASV1", "F_MAMH", "F_MAKH").agg(collect_list("TKET").alias("list_TKET"))\
    #     .withColumn("TKET", col("list_TKET")[0]).drop("list_TKET")
    print("Original df count: {}".format(str(df.count())))
    print("Original df distinct SV_MH distinct count: {}".format(
        str(df.select("MASV1", "F_MAMH", "F_MAKH").distinct().count())))
    print("Original df distinct F_MAMH count: {}".format(
        str(df.select("F_MAMH").distinct().count())))
    print("Original df distinct F_MAKH count: {}".format(
        str(df.select("F_MAKH").distinct().count())))
    course_mapping = FullCourseMappingEstimator()\
        .setItemCol("F_MAMH")\
        .setOutputCol("F_MAMH_new")\
        .fit(fix_mapping_df)

    course_filter = FilterCountTransformer(limit=50)\
        .setItemCol("F_MAMH")

    faculty_filter = FilterCountTransformer(limit=500)\
        .setItemCol("F_MAKH")

    get_max_filter = FilterDuplicateUserItemGetMaxTransformer()\
        .setUserCol("MASV1")\
        .setItemCol("F_MAMH")\
        .setValueCol("TKET")

    mapping_output_df = course_mapping.transform(df).withColumn(
        "F_MAMH", col("F_MAMH_new")).select("MASV1", "F_MAMH", "F_MAKH",
                                            "TKET", "F_NIENKHOA")
    mapping_output_df = spark.createDataFrame(mapping_output_df.rdd,
                                              mapping_output_df.schema)

    remove_duplicate_df = get_max_filter.transform(mapping_output_df)

    course_filter_output_df = course_filter.transform(remove_duplicate_df)
    # distinct_mapping_count_df = mapping_output_df.select("MASV1", "F_MAMH", "F_MAKH").distinct().groupBy("F_MAMH").agg(count(lit(1)).alias("count_distinct"))
    # mapping_count_df = mapping_output_df.groupBy("F_MAMH").agg(count(lit(1)).alias("count"))
    # list_MMH = distinct_mapping_count_df.join(mapping_count_df, ["F_MAMH"]).withColumn("chenhlech", col("count") - col("count_distinct"))\
    #     .filter(col("chenhlech") != 0)\
    #     .select("F_MAMH") \
    #     .rdd.flatMap(lambda x: x).collect()

    # print(list_MMH)
    # print(len(list_MMH))
    faculty_filter_output_df = faculty_filter.transform(
        course_filter_output_df)

    print("After mapping MH count: {}".format(str(mapping_output_df.count())))
    print("After mapping SV_MH distinct count: {}".format(
        str(
            mapping_output_df.select("MASV1", "F_MAMH",
                                     "F_MAKH").distinct().count())))
    print("After remove duplicate MH count: {}".format(
        str(remove_duplicate_df.count())))
    print("After remove duplicate SV_MH distinct count: {}".format(
        str(
            remove_duplicate_df.select("MASV1", "F_MAMH",
                                       "F_MAKH").distinct().count())))
    print("After filter MH < 50 data F_MAMH distinct count: {}".format(
        str(course_filter_output_df.select("F_MAMH").distinct().count())))
    print("After filter MH < 50 data SV_MH distinct count: {}".format(
        str(
            course_filter_output_df.select("MASV1", "F_MAMH",
                                           "F_MAKH").distinct().count())))
    print("After filter KH < 500 data F_MAKH distinct count: {}".format(
        str(faculty_filter_output_df.select("F_MAKH").distinct().count())))
    print("After filter KH < 500 data SV_MH distinct count: {}".format(
        str(
            faculty_filter_output_df.select("MASV1", "F_MAMH",
                                            "F_MAKH").distinct().count())))
    faculty_filter_output_df = faculty_filter_output_df.filter(col("F_NIENKHOA") >= 14)\
        .select("MASV1", "F_MAMH", "F_MAKH", "TKET")
    # # split major
    list_faculty = faculty_filter_output_df.select(
        "F_MAKH").distinct().rdd.flatMap(lambda x: x).collect()
    output_path = "preprocess_output_namhoc"
    print(list_faculty)
    for faculty in list_faculty:
        course_filter_faculty = FilterCountTransformer(limit=15) \
            .setItemCol("F_MAMH")
        faculty_filter_df = course_filter_faculty.transform(
            faculty_filter_output_df.filter(col("F_MAKH") == faculty))
        # print(faculty_filter_df.count())
        faculty_filter_df = spark.createDataFrame(faculty_filter_df.rdd,
                                                  faculty_filter_df.schema)
        list_user = faculty_filter_df.select("MASV1").distinct().rdd.flatMap(
            lambda x: x).collect()
        # print(len(list_user))

        train, validation = train_test_split(np.array(list_user),
                                             test_size=0.2,
                                             random_state=1)
        train, test = train_test_split(np.array(train),
                                       test_size=0.25,
                                       random_state=1)
        # print(len(train))
        # print(len(validation))
        # print(len(test))

        user_schema = StructType([StructField("MASV1", LongType())])

        train_data_df = faculty_filter_df.join(
            spark.createDataFrame([[x] for x in train.tolist()],
                                  schema=user_schema), ["MASV1"])
        validation_data_df = faculty_filter_df.join(
            spark.createDataFrame([[x] for x in validation.tolist()],
                                  schema=user_schema), ["MASV1"])
        test_data_df = faculty_filter_df.join(
            spark.createDataFrame([[x] for x in test.tolist()],
                                  schema=user_schema), ["MASV1"])

        # print(train_data_df.count())
        # print(validation_data_df.count())
        # print(test_data_df.count())

        # train_data_df.show()
        # print("new")
        train_data_df.coalesce(1).write.option("header", "true").option(
            "charset", "UTF-8").csv("{}/{}/{}".format(output_path, faculty,
                                                      "train"))
        validation_data_df.coalesce(1).write.option("header", "true").option(
            "charset", "UTF-8").csv("{}/{}/{}".format(output_path, faculty,
                                                      "validation"))
        test_data_df.coalesce(1).write.option("header", "true").option(
            "charset", "UTF-8").csv("{}/{}/{}".format(output_path, faculty,
                                                      "test"))
Example #18
from pyspark.sql.types import (
    ArrayType,
    BinaryType,
    DoubleType,
    StructType,
    StructField,
    StringType,
    IntegerType,
    LongType,
)

ARCHIVE_ORG_SCHEMA = StructType(
    [
        StructField("created", LongType(), True),
        StructField("d1", StringType(), True),
        StructField("d2", StringType(), True),
        StructField("dir", StringType(), True),
        StructField(
            "files",
            ArrayType(
                StructType(
                    [
                        StructField("bitrate", StringType(), True),
                        StructField("btih", StringType(), True),
                        StructField("crc32", StringType(), True),
                        StructField("format", StringType(), True),
                        StructField("height", StringType(), True),
                        StructField("length", StringType(), True),
                        StructField("license", StringType(), True),
                        StructField("md5", StringType(), True),
                        StructField("mtime", StringType(), True),
Example #19
class CCSparkJob:

    name = 'CCSparkJob'

    output_schema = StructType([
        StructField("key", StringType(), True),
        StructField("val", LongType(), True)
    ])

    warc_parse_http_header = True

    args = None
    records_processed = None
    warc_input_processed = None
    warc_input_failed = None
    log_level = 'INFO'
    logging.basicConfig(level=log_level, format=LOGGING_FORMAT)

    num_input_partitions = 400
    num_output_partitions = 10

    def parse_arguments(self):
        """ Returns the parsed arguments from the command line """

        description = self.name
        if self.__doc__ is not None:
            description += " - "
            description += self.__doc__
        arg_parser = argparse.ArgumentParser(description=description)

        arg_parser.add_argument("input",
                                help="Path to file listing input paths")
        arg_parser.add_argument("output",
                                help="Name of output table"
                                " (saved in spark.sql.warehouse.dir)")

        arg_parser.add_argument("--num_input_partitions",
                                type=int,
                                default=self.num_input_partitions,
                                help="Number of input splits/partitions")
        arg_parser.add_argument("--num_output_partitions",
                                type=int,
                                default=self.num_output_partitions,
                                help="Number of output partitions")
        arg_parser.add_argument("--local_temp_dir",
                                default=None,
                                help="Local temporary directory, used to"
                                " buffer content from S3")

        arg_parser.add_argument("--log_level",
                                default=self.log_level,
                                help="Logging level")

        self.add_arguments(arg_parser)
        args = arg_parser.parse_args()
        self.validate_arguments(args)
        self.init_logging(args.log_level)

        return args

    def add_arguments(self, parser):
        pass

    def validate_arguments(self, args):
        return True

    def init_logging(self, level=None):
        if level is None:
            level = self.log_level
        else:
            self.log_level = level
        logging.basicConfig(level=level, format=LOGGING_FORMAT)

    def get_logger(self, spark_context=None):
        """Get logger from SparkContext or (if None) from logging module"""
        if spark_context is None:
            return logging.getLogger(self.name)
        return spark_context._jvm.org.apache.log4j.LogManager \
            .getLogger(self.name)

    def run(self):
        self.args = self.parse_arguments()

        conf = SparkConf().setAll((
            ("spark.task.maxFailures", "10"),
            ("spark.locality.wait", "20s"),
            ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
        ))
        sc = SparkContext(appName=self.name, conf=conf)
        sqlc = SQLContext(sparkContext=sc)

        self.records_processed = sc.accumulator(0)
        self.warc_input_processed = sc.accumulator(0)
        self.warc_input_failed = sc.accumulator(0)

        self.run_job(sc, sqlc)

        sc.stop()

    def log_aggregator(self, sc, agg, descr):
        self.get_logger(sc).info(descr.format(agg.value))

    def log_aggregators(self, sc):
        self.log_aggregator(sc, self.warc_input_processed,
                            'WARC input files processed = {}')
        self.log_aggregator(sc, self.warc_input_failed,
                            'WARC input files failed = {}')
        self.log_aggregator(sc, self.records_processed,
                            'records processed = {}')

    @staticmethod
    def reduce_by_key_func(a, b):
        return a + b

    def run_job(self, sc, sqlc):
        input_data = sc.textFile(self.args.input,
                                 minPartitions=self.args.num_input_partitions)

        output = input_data.mapPartitionsWithIndex(self.process_warcs) \
            .reduceByKey(self.reduce_by_key_func)

        sqlc.createDataFrame(output, schema=self.output_schema) \
            .coalesce(self.args.num_output_partitions) \
            .write \
            .format("parquet") \
            .saveAsTable(self.args.output)

        self.get_logger(sc).info('records processed = {}'.format(
            self.records_processed.value))

    def process_warcs(self, id_, iterator):
        s3pattern = re.compile('^s3://([^/]+)/(.+)')
        base_dir = os.path.abspath(os.path.dirname(__file__))

        # S3 client (not thread-safe, initialize outside parallelized loop)
        no_sign_request = botocore.client.Config(
            signature_version=botocore.UNSIGNED)
        s3client = boto3.client('s3', config=no_sign_request)

        for uri in iterator:
            self.warc_input_processed.add(1)
            if uri.startswith('s3://'):
                self.get_logger().info('Reading from S3 {}'.format(uri))
                s3match = s3pattern.match(uri)
                if s3match is None:
                    self.get_logger().error("Invalid S3 URI: " + uri)
                    continue
                bucketname = s3match.group(1)
                path = s3match.group(2)
                warctemp = TemporaryFile(mode='w+b',
                                         dir=self.args.local_temp_dir)
                try:
                    s3client.download_fileobj(bucketname, path, warctemp)
                except botocore.client.ClientError as exception:
                    self.get_logger().error('Failed to download {}: {}'.format(
                        uri, exception))
                    self.warc_input_failed.add(1)
                    warctemp.close()
                    continue
                warctemp.seek(0)
                stream = warctemp
            elif uri.startswith('hdfs://'):
                self.get_logger().error("HDFS input not implemented: " + uri)
                continue
            else:
                self.get_logger().info('Reading local stream {}'.format(uri))
                if uri.startswith('file:'):
                    uri = uri[5:]
                uri = os.path.join(base_dir, uri)
                try:
                    stream = open(uri, 'rb')
                except IOError as exception:
                    self.get_logger().error('Failed to open {}: {}'.format(
                        uri, exception))
                    self.warc_input_failed.add(1)
                    continue

            no_parse = (not self.warc_parse_http_header)
            try:
                for record in ArchiveIterator(stream,
                                              no_record_parse=no_parse):
                    for res in self.process_record(record):
                        yield res
                    self.records_processed.add(1)
            except ArchiveLoadFailed as exception:
                self.warc_input_failed.add(1)
                self.get_logger().error('Invalid WARC: {} - {}'.format(
                    uri, exception))
            finally:
                stream.close()

    def process_record(self, record):
        raise NotImplementedError('Processing record needs to be customized')

    @staticmethod
    def is_wet_text_record(record):
        """Return true if WARC record is a WET text/plain record"""
        return (record.rec_type == 'conversion'
                and record.content_type == 'text/plain')

    @staticmethod
    def is_wat_json_record(record):
        """Return true if WARC record is a WAT record"""
        return (record.rec_type == 'metadata'
                and record.content_type == 'application/json')
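A minimal, hypothetical subclass sketch showing how `process_record` is meant to be filled in; here it counts WET text records per target host, assuming warcio-style records (which is what `is_wet_text_record` above relies on):

from urllib.parse import urlparse


class HostCountJob(CCSparkJob):
    """Count WET text records per target URI host."""

    name = 'HostCountJob'

    def process_record(self, record):
        if not self.is_wet_text_record(record):
            return
        uri = record.rec_headers.get_header('WARC-Target-URI')
        if uri:
            # Emit (key, val) pairs matching output_schema: host -> count of 1
            yield urlparse(uri).netloc, 1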
Example #20
    def _attach_distributed_sequence_column(sdf, column_name):
        """
        >>> sdf = ks.DataFrame(['a', 'b', 'c']).to_spark()
        >>> sdf = InternalFrame._attach_distributed_sequence_column(sdf, column_name="sequence")
        >>> sdf.sort("sequence").show()  # doctest: +NORMALIZE_WHITESPACE
        +--------+---+
        |sequence|  0|
        +--------+---+
        |       0|  a|
        |       1|  b|
        |       2|  c|
        +--------+---+
        """
        scols = [scol_for(sdf, column) for column in sdf.columns]

        spark_partition_column = verify_temp_column_name(
            sdf, "__spark_partition_id__")
        offset_column = verify_temp_column_name(sdf, "__offset__")
        row_number_column = verify_temp_column_name(sdf, "__row_number__")

        # 1. Calculates counts per each partition ID. `counts` here is, for instance,
        #     {
        #         1: 83,
        #         6: 83,
        #         3: 83,
        #         ...
        #     }
        sdf = sdf.withColumn(spark_partition_column, F.spark_partition_id())

        # Checkpoint the DataFrame to fix the partition ID.
        sdf = sdf.localCheckpoint(eager=False)

        counts = map(
            lambda x: (x["key"], x["count"]),
            sdf.groupby(
                sdf[spark_partition_column].alias("key")).count().collect(),
        )

        # 2. Calculates cumulative sum in an order of partition id.
        #     Note that it does not matter if partition id guarantees its order or not.
        #     We just need a one-by-one sequential id.

        # sort by partition key.
        sorted_counts = sorted(counts, key=lambda x: x[0])
        # get cumulative sum in an order of partition key.
        cumulative_counts = [0] + list(
            accumulate(map(lambda count: count[1], sorted_counts)))
        # zip it with partition key.
        sums = dict(
            zip(map(lambda count: count[0], sorted_counts), cumulative_counts))

        # 3. Attach offset for each partition.
        @pandas_udf(LongType(), PandasUDFType.SCALAR)
        def offset(id):
            current_partition_offset = sums[id.iloc[0]]
            return pd.Series(current_partition_offset).repeat(len(id))

        sdf = sdf.withColumn(offset_column, offset(spark_partition_column))

        # 4. Calculate row_number in each partition.
        w = Window.partitionBy(spark_partition_column).orderBy(
            F.monotonically_increasing_id())
        row_number = F.row_number().over(w)
        sdf = sdf.withColumn(row_number_column, row_number)

        # 5. Calculate the index.
        return sdf.select((sdf[offset_column] + sdf[row_number_column] -
                           1).alias(column_name), *scols)
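To make steps 2 and 3 above concrete, here is a tiny worked example of the offset bookkeeping in plain Python (the partition ids and counts are made up):

from itertools import accumulate

counts = [(3, 83), (1, 83), (6, 84)]            # (partition_id, row_count), unordered
sorted_counts = sorted(counts, key=lambda x: x[0])
cumulative = [0] + list(accumulate(c for _, c in sorted_counts))
sums = dict(zip((p for p, _ in sorted_counts), cumulative))
# sums == {1: 0, 3: 83, 6: 166}: rows in partition 3 start at global offset 83, etc.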
Example #21
def expected_search_clients_daily_data(define_dataframe_factory):
    # template for the expected results
    factory = define_dataframe_factory(
        map(
            to_field,
            [
                ('client_id', 'a', StringType(), False),
                ('sample_id', '42', StringType(), False),
                ('submission_date', '20170101', StringType(), False),
                ('os', 'windows', StringType(), True),
                ('channel', 'release', StringType(), True),
                ('country', 'DE', StringType(), True),
                ('locale', 'de', StringType(), True),
                ('search_cohort', None, StringType(), True),
                ('app_version', '54.0.1', StringType(), True),
                ('distribution_id', None, StringType(), True),
                ('addon_version', '0.9.5', StringType(), False),
                ('engine', 'google', StringType(), True),
                ('source', 'urlbar', StringType(), True),
                ('tagged-sap', None, LongType(), True),
                ('tagged-follow-on', None, LongType(), True),
                ('tagged_sap', None, LongType(), True),
                ('tagged_follow_on', None, LongType(), True),
                ('sap', 4, LongType(), True),
                # Roughly 2016-01-01
                ('profile_creation_date', 16801, LongType(), False),
                ('default_search_engine', 'google', StringType(), False),
                ('default_search_engine_data_load_path',
                 'jar:[app]/omni.ja!browser/google.xml', StringType(), False),
                ('default_search_engine_data_submission_url',
                 'https://www.google.com/search?q=&ie=utf-8&oe=utf-8&client=firefox-b',
                 StringType(), False),
                ('sessions_started_on_this_day', 1, LongType(), True),
                ('profile_age_in_days', 366, LongType(), True),
                ('subsession_hours_sum', 1.0, DoubleType(), True),
                ('active_addons_count_mean', 2.0, DoubleType(), True),
                ('max_concurrent_tab_count_max', 10, LongType(), True),
                ('tab_open_event_count_sum', 5, LongType(), True),
                ('active_hours_sum', .5, DoubleType(), True),
            ]))

    return factory([
        {
            'client_id': 'b',
            'country': 'US'
        },
        # Covers 5 dupe rows and custom app_version, distribution_id rows
        {
            'app_version': '52.0.3',
            'sap': 28,
            'sessions_started_on_this_day': 7,
            'subsession_hours_sum': 7.0,
            'tab_open_event_count_sum': 35,
            'active_hours_sum': 3.5,
        },
        {
            'engine': 'bing'
        },
        {
            'engine': 'yahoo'
        },
        {
            'client_id': 'c',
            'sap': 0,
            'tagged-sap': None,
            'tagged-follow-on': None,
            'tagged_sap': None,
            'tagged_follow_on': None,
            'source': None,
            'engine': None,
        }
    ])
Example #22
                                       "org.postgresql:postgresql:42.2.19") \
        .config("spark.driver.extraJavaOptions", "-Dlog4j.configuration=file:log4j.properties "
                                                 "-Dspark.yarn.app.container.log.dir=app-logs "
                                                 "-Dlogfile.name=hello-spark") \
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")\
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")\
        .getOrCreate()

    # conf_out = spark.sparkContext.getConf()
    # print(conf_out.toDebugString())

    logger = Log4j(spark)

    schema = StructType([
        StructField("InvoiceNumber", StringType()),
        StructField("CreatedTime", LongType()),
        StructField("StoreID", StringType()),
        StructField("PosID", StringType()),
        StructField("CashierID", StringType()),
        StructField("CustomerType", StringType()),
        StructField("CustomerCardNo", StringType()),
        StructField("TotalAmount", DoubleType()),
        StructField("NumberOfItems", IntegerType()),
        StructField("PaymentMethod", StringType()),
        StructField("CGST", DoubleType()),
        StructField("SGST", DoubleType()),
        StructField("CESS", DoubleType()),
        StructField("DeliveryType", StringType()),
        StructField(
            "DeliveryAddress",
            StructType([
def process_log_data(spark, input_data_path):
    """
    Process log data.

    Parameters:
    spark: SparkSession object
    input_data_path: input path to read from (local or S3)

    Returns:
    log_df, users_table, time_table, user_listen
    """

    pl_start = time()
    print('Starting to process log data')
    # get filepath to log data file
    log_data = input_data_path

    # read log data file
    log_schema = StructType([
        StructField("artist", StringType()),
        StructField("auth", StringType()),
        StructField("firstName", StringType()),
        StructField("gender", StringType()),
        StructField("itemInSession", LongType()),
        StructField("lastName", StringType()),
        StructField("length", DoubleType()),
        StructField("level", StringType()),
        StructField("location", StringType()),
        StructField("method", StringType()),
        StructField("page", StringType()),
        StructField("registration", DoubleType()),
        StructField("sessionId", LongType()),
        StructField("song", StringType()),
        StructField("status", StringType()),
        StructField("ts", StringType()),
        StructField("userAgent", StringType()),
        StructField("userId", StringType())
    ])

    log_df = spark.read.json(log_data, schema=log_schema)

    # Count log events per user at each level (paid vs. free); this runs before
    # the NextSong filter, so all page types are included in the counts.
    paid_users = log_df.select(['userId',
                                'level']).filter(log_df['level'] == 'paid')
    paid_users = paid_users.groupby(['userId']).count()
    free_users = log_df.select(['userId',
                                'level']).filter(log_df['level'] == 'free')
    free_users = free_users.groupby(['userId']).count()
    paid_users.createOrReplaceTempView('paid_users')
    free_users.createOrReplaceTempView('free_users')
    user_listen = spark.sql("""
        select a.userId, a.count puCount, b.count fuCount
        from paid_users a join free_users b
        on a.userId = b.userId
        where a.userId != ''
    """)

    # Keep only the rows where page == "NextSong"
    log_df = log_df.filter(log_df.page == 'NextSong').collect()

    # Convert the collected list of Rows back into a Spark DataFrame.
    # (The collect()/createDataFrame() round trip pulls every row to the driver;
    # the filter alone would keep the data distributed.)
    log_df = spark.createDataFrame(log_df, schema=log_schema)

    # Convert ts from long to datetime
    convert_ts = udf(
        lambda x: datetime.datetime.fromtimestamp(float(x) / 1000.0),
        TimestampType())
    log_df = log_df.withColumn("ts_converted", convert_ts(log_df.ts))

    # Convert registration from double to long
    log_df = log_df.withColumn("registration_converted",
                               log_df.registration.cast(LongType()))

    pl_et = time() - pl_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Process log files : Read & Transformation', round(pl_et, 2)))

    print('Creating users table')
    temp_start = time()
    # extract columns for users table
    # creating users table with columns user_id, first_name, last_name, gender, level
    users_table = log_df.select(['userId', 'firstName', 'lastName', 'gender', 'level'])\
            .withColumnRenamed('userId', 'user_id')\
            .withColumnRenamed('firstName', 'first_name')\
            .withColumnRenamed('lastName', 'last_name').dropDuplicates()

    pl_et = time() - temp_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Creating users table', round(pl_et, 2)))

    print('Creating user_listen table')
    temp_start = time()
    user_listen.createOrReplaceTempView('user_listen')
    users_table.createOrReplaceTempView('users')
    user_listen = spark.sql("""
    select distinct b.first_name, a.puCount, a.fuCount
    from user_listen a join users b
    on a.userId = b.user_id
    """)
    pl_et = time() - temp_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Creating user_listen table', round(pl_et, 2)))

    # extract columns to create time table
    # Creating time table with columns start_time, hour, day, week, month, year, weekday
    print('Creating time table')
    temp_start = time()
    time_table = log_df.select(['ts_converted'])\
                        .withColumnRenamed('ts_converted','start_time')

    time_table = time_table.withColumn('day', F.dayofmonth('start_time')) \
                          .withColumn('month', F.month('start_time')) \
                          .withColumn('year', F.year('start_time')) \
                          .withColumn('hour', F.hour('start_time')) \
                          .withColumn('minute', F.minute('start_time')) \
                          .withColumn('second', F.second('start_time')) \
                          .withColumn('week', F.weekofyear('start_time')) \
                          .withColumn('weekday', F.dayofweek('start_time')).dropDuplicates()
    pl_et = time() - temp_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Creating time table', round(pl_et, 2)))

    pl_et = time() - pl_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Process log files : Total', round(pl_et, 2)))
    return log_df, users_table, time_table, user_listen
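For context, a minimal, hypothetical driver for the function above (the app name and the S3 path are assumptions, not part of the original):

from pyspark.sql import SparkSession

if __name__ == "__main__":
    # Hypothetical entry point; the real project wires this up elsewhere.
    spark = SparkSession.builder.appName("sparkify-etl").getOrCreate()
    log_df, users_table, time_table, user_listen = process_log_data(
        spark, "s3a://udacity-dend/log_data/*/*/*.json")
    users_table.show(5)
    time_table.show(5)
    user_listen.show(5)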
                     "r",
                     encoding='ISO-8859-1',
                     errors='ignore') as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames


spark = get_spark_session('ALSExample')

moviesSchema = StructType([
    StructField("userID", IntegerType(), True),
    StructField("movieID", IntegerType(), True),
    StructField("rating", IntegerType(), True),
    StructField("timestamp", LongType(), True)])

names = loadMovieNames()

ratings = spark.read.option("sep", "\t").schema(moviesSchema) \
    .csv(f"{SPARK_DATA_PATH}/ml-100k/u.data")

print("Training recommendation model...")

als = ALS().setMaxIter(5).setRegParam(0.01).setUserCol("userID").setItemCol("movieID") \
    .setRatingCol("rating")

model = als.fit(ratings)

# Manually construct a DataFrame of the user IDs we want recommendations for
userID = int(sys.argv[1])
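The snippet stops right after parsing the target user ID. A hedged sketch of the usual next step, using ALSModel.recommendForUserSubset (variable names reuse the code above; the top-10 count is arbitrary):

# Continuation sketch, not part of the original snippet.
userSubset = spark.createDataFrame([(userID,)], ["userID"])
recommendations = model.recommendForUserSubset(userSubset, 10).collect()

for userRecs in recommendations:
    for rec in userRecs.recommendations:
        # Map the recommended movieID back to its title via the names dict.
        print(names.get(rec.movieID, "unknown"), rec.rating)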
"""This module contains the schema of the song and log data on S3."""

from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, LongType

song_schema = (StructType([
    StructField('artist_id', StringType(), True),
    StructField('artist_latitude', DoubleType(), True),
    StructField('artist_location', StringType(), True),
    StructField('artist_longitude', DoubleType(), True),
    StructField('artist_name', StringType(), True),
    StructField('duration', DoubleType(), True),
    StructField('num_songs', LongType(), True),
    StructField('song_id', StringType(), True),
    StructField('title', StringType(), True),
    StructField('year', LongType(), True)
]))

log_schema = (StructType([
    StructField('artist', StringType(), True),
    StructField('auth', StringType(), True),
    StructField('firstName', StringType(), True),
    StructField('gender', StringType(), True),
    StructField('itemInSession', LongType(), True),
    StructField('lastName', StringType(), True),
    StructField('length', DoubleType(), True),
    StructField('level', StringType(), True),
    StructField('location', StringType(), True),
    StructField('method', StringType(), True),
    StructField('page', StringType(), True),
    StructField('registration', DoubleType(), True),
    StructField('sessionId', LongType(), True),
Exemple #26
0
df = spark.range(1000).toDF("nums")
spark.range(5).collect()
df.select(df["nums"] + 10)

df = spark.read.format("json").load("data/flight-data/json/2015-summary.json")
dfS = spark.read.format("json").load(
    "data/flight-data/json/2015-summary.json").schema
dfS
df.printSchema()

from pyspark.sql.types import StructField, StructType, StringType, LongType

myManualSchema = StructType([
    StructField("DEST_COUNTRY_NAME", StringType(), True),
    StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
    StructField("count", LongType(), False, metadata={"hello": "world"}),
])
df = spark.read.format("json").schema(myManualSchema)\
    .load("data/flight-data/json/2015-summary.json")
df

from pyspark.sql.functions import col, column
df.col("count")

from pyspark.sql import Row

myManualSchema = StructType([
    StructField("id", StringType(), True),
    StructField("name", StringType(), True),
    StructField("rank", LongType(), False)
])
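The snippet ends at the schema definition; a hedged continuation showing how such a manual schema is typically paired with a Row (the values are illustrative, not from the original):

# Continuation sketch, not part of the original snippet.
myRow = Row("Hello", None, 1)
myDf = spark.createDataFrame([myRow], myManualSchema)
myDf.show()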
Exemple #27
0
from util import get_catlog
from pyspark.sql.functions import *
from pyspark.sql.types import StringType, BooleanType, FloatType, IntegerType, LongType, DateType, TimestampType, Row, \
    StructField, StructType
from config import *
from pyspark.sql import DataFrame
from pyspark import SparkConf, SparkContext, HiveContext, RDD
from transformData import USER_TITLE, exception, logger, execute_func
import sys

TYPE_DICT = {
    "string": StringType(),
    "boolean": BooleanType(),
    "float": FloatType(),
    "int": IntegerType(),
    "long": LongType(),
    "date": DateType(),
    "datetime": TimestampType()
}
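TYPE_DICT maps textual type names to Spark types. A hedged sketch, reusing the imports above, of how such a mapping is typically turned into a schema (the helper and the title format are assumptions, not from the original):

# Hypothetical helper: assumes `title` is a list of (column_name, type_name) pairs.
def build_schema(title):
    return StructType([
        StructField(name, TYPE_DICT.get(type_name, StringType()), True)
        for name, type_name in title
    ])

# e.g. build_schema([("org_id", "string"), ("created_at", "datetime")])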

TITTLE = USER_TITLE
CONNECT_TABLE_CATELOG = {
    "table": {
        "namespace": "default",
        "name": "ORG_CONNECT_TAB"
    },
    "rowkey": "",
    "domain": "",
    "columns": {
        "org_id": {
            "cf": "rowkey",
Exemple #28
0
    def _select_rows(self, rows_sel):
        from databricks.koalas.series import Series

        if isinstance(rows_sel, Series):
            assert isinstance(rows_sel.spark_type,
                              BooleanType), rows_sel.spark_type
            return rows_sel._scol, None, None
        elif isinstance(rows_sel, slice):
            assert len(self._internal.index_columns) > 0
            if rows_sel.step is not None:
                LocIndexer._raiseNotImplemented("Cannot use step with Spark.")
            if rows_sel == slice(None):
                # If slice is None - select everything, so nothing to do
                return None, None, None
            elif len(self._internal.index_columns) == 1:
                sdf = self._internal.sdf
                index = self._kdf_or_kser.index
                index_column = index.to_series()
                index_data_type = index_column.spark_type
                start = rows_sel.start
                stop = rows_sel.stop

                # get natural order from '__natural_order__' from start to stop
                # to keep natural order.
                start_and_stop = (sdf.select(
                    index_column._scol, NATURAL_ORDER_COLUMN_NAME
                ).where(
                    (index_column._scol == F.lit(start).cast(index_data_type))
                    | (index_column._scol == F.lit(stop).cast(index_data_type))
                ).collect())

                start = [row[1] for row in start_and_stop if row[0] == start]
                start = start[0] if len(start) > 0 else None

                stop = [row[1] for row in start_and_stop if row[0] == stop]
                stop = stop[-1] if len(stop) > 0 else None

                cond = []
                if start is not None:
                    cond.append(
                        F.col(NATURAL_ORDER_COLUMN_NAME) >= F.lit(start).cast(
                            LongType()))
                if stop is not None:
                    cond.append(
                        F.col(NATURAL_ORDER_COLUMN_NAME) <= F.lit(stop).cast(
                            LongType()))

                # if index order is not monotonic increasing or decreasing
                # and specified values don't exist in index, raise KeyError
                if ((start is None and rows_sel.start is not None)
                        or (stop is None and rows_sel.stop is not None)):
                    inc, dec = sdf.select(
                        index_column._is_monotonic()._scol.alias('__increasing__'),
                        index_column._is_monotonic_decreasing()._scol.alias('__decreasing__')) \
                        .select(F.min(F.coalesce('__increasing__', F.lit(True))),
                                F.min(F.coalesce('__decreasing__', F.lit(True)))).first()
                    if start is None and rows_sel.start is not None:
                        start = rows_sel.start
                        if inc is not False:
                            cond.append(index_column._scol >= F.lit(
                                start).cast(index_data_type))
                        elif dec is not False:
                            cond.append(index_column._scol <= F.lit(
                                start).cast(index_data_type))
                        else:
                            raise KeyError(rows_sel.start)
                    if stop is None and rows_sel.stop is not None:
                        stop = rows_sel.stop
                        if inc is not False:
                            cond.append(index_column._scol <= F.lit(stop).cast(
                                index_data_type))
                        elif dec is not False:
                            cond.append(index_column._scol >= F.lit(stop).cast(
                                index_data_type))
                        else:
                            raise KeyError(rows_sel.stop)

                if len(cond) > 0:
                    return reduce(lambda x, y: x & y, cond), None, None
            else:
                LocIndexer._raiseNotImplemented(
                    "Cannot use slice for MultiIndex with Spark.")
        elif is_list_like(rows_sel) and not isinstance(rows_sel, tuple):
            rows_sel = list(rows_sel)
            if len(rows_sel) == 0:
                return F.lit(False), None, None
            elif len(self._internal.index_columns) == 1:
                index_column = self._kdf_or_kser.index.to_series()
                index_data_type = index_column.spark_type
                if len(rows_sel) == 1:
                    return (index_column._scol == F.lit(
                        rows_sel[0]).cast(index_data_type), None, None)
                else:
                    return (index_column._scol.isin([
                        F.lit(r).cast(index_data_type) for r in rows_sel
                    ]), None, None)
            else:
                LocIndexer._raiseNotImplemented(
                    "Cannot select with MultiIndex with Spark.")
        else:
            if not isinstance(rows_sel, tuple):
                rows_sel = (rows_sel, )
            if len(rows_sel) > len(self._internal.index_map):
                raise SparkPandasIndexingError('Too many indexers')

            rows = [
                scol == value
                for scol, value in zip(self._internal.index_scols, rows_sel)
            ]
            return (reduce(lambda x, y: x & y, rows), None,
                    len(self._internal.index_map) - len(rows_sel))
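A brief, hedged illustration of the label-based slicing that _select_rows implements (Koalas .loc, where both slice endpoints are inclusive; the values are illustrative):

import databricks.koalas as ks

kdf = ks.DataFrame({'a': [1, 2, 3, 4]}, index=[10, 20, 30, 40])
# Both endpoints of a .loc slice are inclusive, resolved by the logic above.
print(kdf.loc[20:30])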
Exemple #29
0
 def _cast_spark_column_timestamp_to_long(self, scol: Column) -> Column:
     return scol.cast(LongType())
Exemple #30
0
 def hour(self) -> ks.Series:
     """
     The hours of the datetime.
     """
     return _wrap_accessor_spark(self, F.hour, LongType()).alias(self.name)
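A quick, hedged usage sketch of the accessor above (assuming it is exposed through the Koalas .dt namespace, as in pandas):

import pandas as pd
import databricks.koalas as ks

kser = ks.Series(pd.date_range("2021-01-01 10:30", periods=3, freq="H"))
print(kser.dt.hour)  # 10, 11, 12 as a Koalas Series of longs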