Example #1
def process_log_data(spark, input_data, output_data):
    '''
    Purpose:  Process log data, create users, time, and songplays tables
    :param spark: Spark session
    :param input_data: Input data file path
    :param output_data: Output data file path
    :return:
    '''
    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'
    
    # create schema
    schema = StructType([StructField('artist', StringType(), True),
                         StructField('auth', StringType(), True),
                         StructField('firstName', StringType(), True),
                         StructField('gender', StringType(), True),
                         StructField('itemInSession', IntegerType(), True),
                         StructField('lastName', StringType(), True),
                         StructField('length', DoubleType(), True),
                         StructField('level', StringType(), True),
                         StructField('location', StringType(), True),
                         StructField('method', StringType(), True),
                         StructField('page', StringType(), True),
                         StructField('registration', StringType(), True),
                         StructField('sessionId', IntegerType(), True),
                         StructField('song', StringType(), True),
                         StructField('status', IntegerType(), True),
                         StructField('ts', TimestampType(), True),
                         StructField('userAgent', StringType(), True),
                         StructField('userId', IntegerType(), False)])

    # read log data file
    df = spark.read.json(log_data, schema = schema)
    
    # filter by actions for song plays
    df = df.filter(df.page=="NextSong")

    # extract columns for users table    
    users_table = df.select("userId as user_id", "firstName as first_name", "lastName as last_name", "gender", "level") \
                    .dropDuplicates()
    
    # write users table to parquet files
    users_table.write.parquet(output_data + "users/")

    # create timestamp column from original timestamp column
    get_datetime = udf(date_conversion, TimestampType())
    df = df.withColumn("start_time", get_datetime('ts'))
    
    # extract columns to create time table
    time_table = df.select("start_time") \
                    .dropDuplicates() \
                    .withColumn("hour", hour(col("start_time"))) \
    .withColumn("day", day(col("start_time"))) \
    .withColumn("week", week(col('start_time'))) \
    .withColumn("month", month(col('start_time'))) \
    .withColumn("year", year(col('start_time'))) \
    .withColumn("weekday", date_format(col('start_time')))
    
    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year","month").parquet(output_data + "time/")

    # read in song data to use for songplays table
    songs_df = spark.read.parquet(output_data + 'song_data/*/*/*')
    
    artists_df = spark.read.parquet(output_data + 'artists/*')

    # extract columns from joined song and log datasets to create songplays table 
    songs_logs_joined_df = df.join(songs_df, df.song == songs_df.title)
    artists_songs_logs_joined_df = songs_logs_joined_df.join(artists_df, songs_logs_joined_df.artist == artists_df.name)
    songplays_table = artists_songs_logs_joined_df.join(time_table, \
                                                        artists_songs_logs_joined_df.ts == time_table.start_time, "left") \
                                                  .drop(artists_songs_logs_joined_df.year)

    songplays_table = songplays_table.selectExpr("start_time", "userId as user_id", "level", "song_id", "artist_id",
                                                 "sessionId as session_id", "location", "userAgent as user_agent", "year",
                                                 "month")
Example #2
    def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
        """
        Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting
        to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the
        data types will be used to coerce the data in Pandas to Arrow conversion.
        """
        from pyspark.sql import SparkSession
        from pyspark.sql.dataframe import DataFrame

        assert isinstance(self, SparkSession)

        from pyspark.sql.pandas.serializers import ArrowStreamPandasSerializer
        from pyspark.sql.types import DataType, StructType, TimestampType
        from pyspark.sql.pandas.types import from_arrow_type, to_arrow_type
        from pyspark.sql.pandas.utils import require_minimum_pandas_version, \
            require_minimum_pyarrow_version

        require_minimum_pandas_version()
        require_minimum_pyarrow_version()

        from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
        import pyarrow as pa

        # Create the Spark schema from list of names passed in with Arrow types
        if isinstance(schema, (list, tuple)):
            arrow_schema = pa.Schema.from_pandas(pdf, preserve_index=False)
            struct = StructType()
            for name, field in zip(schema, arrow_schema):
                struct.add(name, from_arrow_type(field.type), nullable=field.nullable)
            schema = struct

        # Determine arrow types to coerce data when creating batches
        if isinstance(schema, StructType):
            arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
        elif isinstance(schema, DataType):
            raise ValueError("Single data type %s is not supported with Arrow" % str(schema))
        else:
            # Any timestamps must be coerced to be compatible with Spark
            arrow_types = [to_arrow_type(TimestampType())
                           if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
                           for t in pdf.dtypes]

        # Slice the DataFrame to be batched
        step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
        pdf_slices = (pdf.iloc[start:start + step] for start in range(0, len(pdf), step))

        # Create list of Arrow (columns, type) for serializer dump_stream
        arrow_data = [[(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]
                      for pdf_slice in pdf_slices]

        jsqlContext = self._wrapped._jsqlContext

        safecheck = self._wrapped._conf.arrowSafeTypeConversion()
        col_by_name = True  # col by name only applies to StructType columns, can't happen here
        ser = ArrowStreamPandasSerializer(timezone, safecheck, col_by_name)

        def reader_func(temp_filename):
            return self._jvm.PythonSQLUtils.readArrowStreamFromFile(jsqlContext, temp_filename)

        def create_RDD_server():
            return self._jvm.ArrowRDDServer(jsqlContext)

        # Create Spark DataFrame from Arrow stream file, using one batch per partition
        jrdd = self._sc._serialize_to_jvm(arrow_data, ser, reader_func, create_RDD_server)
        jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(), jsqlContext)
        df = DataFrame(jdf, self._wrapped)
        df._schema = schema
        return df
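In normal use this helper is reached indirectly through SparkSession.createDataFrame once Arrow conversion is enabled. A small sketch of that public path follows; the sample pandas DataFrame is an assumption for illustration.

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# Spark 3.x configuration key; with it enabled, createDataFrame(pandas_df)
# is routed through _create_from_pandas_with_arrow.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

pdf = pd.DataFrame({
    "id": [1, 2, 3],
    "ts": pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"]),
})
df = spark.createDataFrame(pdf)
df.printSchema()  # ts becomes a Spark timestamp column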
Example #3
    def setUpClass(cls):
        from datetime import date, datetime
        from decimal import Decimal

        super(ArrowTests, cls).setUpClass()
        cls.warnings_lock = threading.Lock()

        # Synchronize default timezone between Python and Java
        cls.tz_prev = os.environ.get("TZ", None)  # save current tz if set
        tz = "America/Los_Angeles"
        os.environ["TZ"] = tz
        time.tzset()

        cls.spark.conf.set("spark.sql.session.timeZone", tz)

        # Test fallback
        cls.spark.conf.set("spark.sql.execution.arrow.enabled", "false")
        assert cls.spark.conf.get(
            "spark.sql.execution.arrow.pyspark.enabled") == "false"
        cls.spark.conf.set("spark.sql.execution.arrow.enabled", "true")
        assert cls.spark.conf.get(
            "spark.sql.execution.arrow.pyspark.enabled") == "true"

        cls.spark.conf.set("spark.sql.execution.arrow.fallback.enabled",
                           "true")
        assert cls.spark.conf.get(
            "spark.sql.execution.arrow.pyspark.fallback.enabled") == "true"
        cls.spark.conf.set("spark.sql.execution.arrow.fallback.enabled",
                           "false")
        assert cls.spark.conf.get(
            "spark.sql.execution.arrow.pyspark.fallback.enabled") == "false"

        # Enable Arrow optimization in these tests.
        cls.spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
        # Disable fallback by default to easily detect the failures.
        cls.spark.conf.set(
            "spark.sql.execution.arrow.pyspark.fallback.enabled", "false")

        cls.schema_wo_null = StructType([
            StructField("1_str_t", StringType(), True),
            StructField("2_int_t", IntegerType(), True),
            StructField("3_long_t", LongType(), True),
            StructField("4_float_t", FloatType(), True),
            StructField("5_double_t", DoubleType(), True),
            StructField("6_decimal_t", DecimalType(38, 18), True),
            StructField("7_date_t", DateType(), True),
            StructField("8_timestamp_t", TimestampType(), True),
            StructField("9_binary_t", BinaryType(), True),
        ])
        cls.schema = cls.schema_wo_null.add("10_null_t", NullType(), True)
        cls.data_wo_null = [
            (
                "a",
                1,
                10,
                0.2,
                2.0,
                Decimal("2.0"),
                date(1969, 1, 1),
                datetime(1969, 1, 1, 1, 1, 1),
                bytearray(b"a"),
            ),
            (
                "b",
                2,
                20,
                0.4,
                4.0,
                Decimal("4.0"),
                date(2012, 2, 2),
                datetime(2012, 2, 2, 2, 2, 2),
                bytearray(b"bb"),
            ),
            (
                "c",
                3,
                30,
                0.8,
                6.0,
                Decimal("6.0"),
                date(2100, 3, 3),
                datetime(2100, 3, 3, 3, 3, 3),
                bytearray(b"ccc"),
            ),
            (
                "d",
                4,
                40,
                1.0,
                8.0,
                Decimal("8.0"),
                date(2262, 4, 12),
                datetime(2262, 3, 3, 3, 3, 3),
                bytearray(b"dddd"),
            ),
        ]
        cls.data = [tuple(list(d) + [None]) for d in cls.data_wo_null]
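Inside a test method of this class, the schema and rows defined above are typically exercised like the sketch below (not copied from the original tests): build a Spark DataFrame with the explicit schema and pull it back as pandas, which is the round trip the Arrow path accelerates.

    def test_toPandas_roundtrip_sketch(self):
        # Build a Spark DataFrame from the class-level rows with the explicit
        # schema, then convert it back to pandas; Arrow accelerates this step.
        df = self.spark.createDataFrame(self.data_wo_null, schema=self.schema_wo_null)
        pdf = df.toPandas()
        self.assertEqual(len(pdf), len(self.data_wo_null))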
Example #4
def process_log_data(spark, input_data, output_data):
    """Process user log data creating the tables user, time and songplays

    Args:
        spark (SparkSession): The spark session object
        input_data (str): The input files path
        output_data (str): The output files path
    """
    # read log data file
    LOGGER.info('read log data file')
    log_df = spark.read.json(input_data)

    # filter by actions for song plays
    LOGGER.info('filter by actions for song plays')
    log_df = log_df.where(F.col('page') == 'NextSong')

    # extract columns for users table
    LOGGER.info('extract columns for users table')
    user_table = log_df.select(
        ['userId', 'firstName', 'lastName', 'gender', 'level'])

    # write users table to parquet files
    LOGGER.info('write users table to parquet files')
    user_path = os.path.join(output_data, 'user')
    user_table.coalesce(1).write.mode('overwrite').parquet(user_path)

    # create datetime column from original timestamp column
    LOGGER.info('create datetime column from original timestamp column')
    get_timestamp = F.udf(lambda x: datetime.utcfromtimestamp(int(x) / 1000),
                          TimestampType())
    log_df = log_df.withColumn("start_time", get_timestamp("ts"))

    # extract columns to create time table
    LOGGER.info('extract columns to create time table')
    time_table = log_df.select(
        'start_time',
        F.hour('start_time').alias('hour'),
        F.dayofmonth('start_time').alias('day'),
        F.weekofyear('start_time').alias('weekofyear'),
        F.month('start_time').alias('month'),
        F.year('start_time').alias('year'),
        F.dayofweek('start_time').alias('weekday')).drop_duplicates(
            ['start_time'])

    # write time table to parquet partitioned by year and month
    LOGGER.info('write time table to parquet partitioned by year and month')
    time_table.coalesce(1).write.mode('overwrite')\
        .partitionBy('year', 'month')\
        .parquet(os.path.join(output_data, 'time'))

    # read in song data to use for songplays table
    LOGGER.info('read in song data to use for songplays table')
    song_df = spark.read.parquet(os.path.join(output_data, 'song'))
    artist_df = spark.read.parquet(os.path.join(output_data, 'artist'))

    # join artist and song data
    LOGGER.info('join artist and song data')
    song_df = artist_df.select(['artist_name', 'artist_id'])\
        .join(song_df, on='artist_id', how='inner')

    # extract columns from joined song and log datasets to create songplays
    LOGGER.info('extract columns from joined song and log datasets to create '
                'songplays')
    on_clause = \
        (song_df.title == log_df.song) \
        & (song_df.artist_name == log_df.artist) \
        & (song_df.duration == log_df.length)
    songplays_table = log_df.join(song_df, on_clause, how='inner')

    # select columns and create year and month columns
    LOGGER.info('select columns and create year and month columns')
    songplays_table = songplays_table.select(
        'start_time',
        F.col('userId').alias('user_id'), 'level', 'song_id', 'artist_id',
        F.col('itemInSession').alias('session_id'), 'location',
        F.col('userAgent').alias('user_agent'),
        F.month('start_time').alias('month'),
        F.year('start_time').alias('year'))

    # create songplay_id and drop duplicates by this column
    LOGGER.info('create songplay_id and drop duplicates by this column')
    key_columns = [
        'start_time', 'user_id', 'song_id', 'artist_id', 'session_id'
    ]
    songplays_table = songplays_table.withColumn(
        'songplay_id', F.sha2(F.concat_ws("||", *key_columns),
                              256)).drop_duplicates(['songplay_id'])

    # write songplays table to parquet files partitioned by year and month
    LOGGER.info('write songplays table to parquet partitioned by year/month')
    songplays_table.coalesce(1).write.mode('overwrite')\
        .partitionBy('year', 'month')\
        .parquet(os.path.join(output_data, 'songplays'))
Example #5
from datetime import datetime
from pyspark.sql import Row
from pyspark.sql.types import StructField, StructType, ArrayType, StringType, TimestampType

# NOTE: please keep this schema definition alphabetized
listen_schema = [
    StructField('artist_mbids', ArrayType(StringType()), nullable=True),
    StructField('artist_msid', StringType(), nullable=False),
    StructField('artist_name', StringType(), nullable=False),
    StructField('listened_at', TimestampType(), nullable=False),
    StructField('recording_mbid', StringType(), nullable=True),
    StructField('recording_msid', StringType(), nullable=False),
    StructField('release_mbid', StringType(), nullable=True),
    StructField('release_msid', StringType(), nullable=True),
    StructField('release_name', StringType(), nullable=True),
    StructField('tags', ArrayType(StringType()), nullable=True),
    StructField('track_name', StringType(), nullable=False),
    StructField('user_name', StringType(), nullable=False),
]

# The field names of the schema need to be sorted, otherwise we get weird
# errors due to type mismatches when creating DataFrames using the schema
# Although, we try to keep it sorted in the actual definition itself, we
# also sort it programmatically just in case
listen_schema = StructType(sorted(listen_schema, key=lambda field: field.name))
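A quick sketch (not part of the original module) of using the sorted schema: as long as the Row fields are also supplied in alphabetical order, their positions line up with the sorted schema.

from datetime import datetime
from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.getOrCreate()

# Field names listed alphabetically so their positions match the sorted schema.
example_listen = Row(
    artist_mbids=[], artist_msid="artist-msid", artist_name="Artist",
    listened_at=datetime(2021, 1, 1, 12, 0, 0), recording_mbid=None,
    recording_msid="recording-msid", release_mbid=None, release_msid=None,
    release_name=None, tags=[], track_name="Track", user_name="listener",
)
listens_df = spark.createDataFrame([example_listen], schema=listen_schema)
listens_df.printSchema()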


def convert_listen_to_row(listen):
    """ Convert a listen to a pyspark.sql.Row object.

    Args: listen (dict): a single dictionary representing a listen
Example #6
    .load()

trackingMessageSchema = StructType() \
    .add("countryCode", StringType()) \
    .add("timestamp", IntegerType()) \
    .add("newCases", IntegerType()) \
    .add("newCuredCases", IntegerType())

trackingMessages = kafkaMessages.select(
    from_json(
        col("value").cast("string"),
        trackingMessageSchema
    ).alias("json")
).select(
    from_unixtime(column('json.timestamp'))
    .cast(TimestampType())
    .alias("parsed_timestamp"),
    column("json.*")
).withColumn("date", to_date(col("parsed_timestamp"))) \
.withColumnRenamed('json.countryCode', 'countryCode') \
.withColumnRenamed('json.newCases', 'newCases') \
.withColumnRenamed('json.newCuredCases', 'newCuredCases') \
.withWatermark("parsed_timestamp", windowDuration)


cases = trackingMessages.groupBy(
    window(
        column("parsed_timestamp"),
        windowDuration,
        slidingDuration
    ),
Example #7
    def test_as_spark_type_koalas_dtype(self):
        type_mapper = {
            # binary
            np.character: (np.character, BinaryType()),
            np.bytes_: (np.bytes_, BinaryType()),
            np.string_: (np.bytes_, BinaryType()),
            bytes: (np.bytes_, BinaryType()),
            # integer
            np.int8: (np.int8, ByteType()),
            np.byte: (np.int8, ByteType()),
            np.int16: (np.int16, ShortType()),
            np.int32: (np.int32, IntegerType()),
            np.int64: (np.int64, LongType()),
            np.int: (np.int64, LongType()),
            int: (np.int64, LongType()),
            # floating
            np.float32: (np.float32, FloatType()),
            np.float: (np.float64, DoubleType()),
            np.float64: (np.float64, DoubleType()),
            float: (np.float64, DoubleType()),
            # string
            np.str: (np.unicode_, StringType()),
            np.unicode_: (np.unicode_, StringType()),
            str: (np.unicode_, StringType()),
            # bool
            np.bool: (np.bool, BooleanType()),
            bool: (np.bool, BooleanType()),
            # datetime
            np.datetime64: (np.datetime64, TimestampType()),
            datetime.datetime: (np.dtype("datetime64[ns]"), TimestampType()),
            # DateType
            datetime.date: (np.dtype("object"), DateType()),
            # DecimalType
            decimal.Decimal: (np.dtype("object"), DecimalType(38, 18)),
            # ArrayType
            np.ndarray: (np.dtype("object"), ArrayType(StringType())),
            List[bytes]: (np.dtype("object"), ArrayType(BinaryType())),
            List[np.character]: (np.dtype("object"), ArrayType(BinaryType())),
            List[np.bytes_]: (np.dtype("object"), ArrayType(BinaryType())),
            List[np.string_]: (np.dtype("object"), ArrayType(BinaryType())),
            List[bool]: (np.dtype("object"), ArrayType(BooleanType())),
            List[np.bool]: (np.dtype("object"), ArrayType(BooleanType())),
            List[datetime.date]: (np.dtype("object"), ArrayType(DateType())),
            List[np.int8]: (np.dtype("object"), ArrayType(ByteType())),
            List[np.byte]: (np.dtype("object"), ArrayType(ByteType())),
            List[decimal.Decimal]:
            (np.dtype("object"), ArrayType(DecimalType(38, 18))),
            List[float]: (np.dtype("object"), ArrayType(DoubleType())),
            List[np.float]: (np.dtype("object"), ArrayType(DoubleType())),
            List[np.float64]: (np.dtype("object"), ArrayType(DoubleType())),
            List[np.float32]: (np.dtype("object"), ArrayType(FloatType())),
            List[np.int32]: (np.dtype("object"), ArrayType(IntegerType())),
            List[int]: (np.dtype("object"), ArrayType(LongType())),
            List[np.int]: (np.dtype("object"), ArrayType(LongType())),
            List[np.int64]: (np.dtype("object"), ArrayType(LongType())),
            List[np.int16]: (np.dtype("object"), ArrayType(ShortType())),
            List[str]: (np.dtype("object"), ArrayType(StringType())),
            List[np.unicode_]: (np.dtype("object"), ArrayType(StringType())),
            List[datetime.datetime]:
            (np.dtype("object"), ArrayType(TimestampType())),
            List[np.datetime64]:
            (np.dtype("object"), ArrayType(TimestampType())),
            # CategoricalDtype
            CategoricalDtype(categories=["a", "b", "c"]): (
                CategoricalDtype(categories=["a", "b", "c"]),
                LongType(),
            ),
        }

        for numpy_or_python_type, (dtype, spark_type) in type_mapper.items():
            self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)
            self.assertEqual(pandas_on_spark_type(numpy_or_python_type),
                             (dtype, spark_type))

        with self.assertRaisesRegex(TypeError,
                                    "Type uint64 was not understood."):
            as_spark_type(np.dtype("uint64"))

        with self.assertRaisesRegex(TypeError,
                                    "Type object was not understood."):
            as_spark_type(np.dtype("object"))

        with self.assertRaisesRegex(TypeError,
                                    "Type uint64 was not understood."):
            pandas_on_spark_type(np.dtype("uint64"))

        with self.assertRaisesRegex(TypeError,
                                    "Type object was not understood."):
            pandas_on_spark_type(np.dtype("object"))
Example #8
def process_log_data(spark, input_data, output_data):
    """
        Description:
        - Extracts log data from JSON files stored in an S3 bucket
        - Transforms the log data into three separate DataFrames: users_table, time_table and songplays_table
        - Loads them back into S3 as parquet files stored in a separate S3 bucket for analytical purposes

        Arguments:
        - Parameter spark: the instantiated SparkSession
        - Parameter input_data: input path
        - Parameter output_data: output path

        Returns:
        - None
    """
    # get filepath to log data file
    log_data = input_data + "log_data/*.json"

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where(df["page"] == "NextSong")

    # extract columns for users table
    users_table = df.selectExpr(["userId as user_id"       ,\
                                 "firstName as first_name" ,\
                                 "lastName as last_name"   ,\
                                 "gender"                  ,\
                                 "level"]).dropDuplicates()

    # write users table to parquet files
    users_table.write.mode("overwrite").parquet(output_data + "users.parquet")

    # create timestamp column from original timestamp column
    get_timestamp = udf(
        lambda epoch_time: datetime.fromtimestamp(epoch_time / 1000),
        TimestampType())
    df = df.withColumn('timestamp', get_timestamp(col('ts')))

    # create datetime column from original timestamp column
    get_datetime = udf(
        lambda epoch_time: datetime.fromtimestamp(epoch_time / 1000),
        DateType())
    df = df.withColumn('datetime', get_datetime(col('ts')))

    # extract columns to create time table
    time_table = df.select([hour("timestamp").alias("hour")       ,\
                            dayofmonth("timestamp").alias("day")  ,\
                            weekofyear("timestamp").alias("week") ,\
                            month("timestamp").alias("month")     ,\
                            year("timestamp").alias("year")       ,\
                            date_format("timestamp", 'E').alias("weekday")]).dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.mode("overwrite").partitionBy(
        "year", "month").parquet(output_data + "time.parquet")

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + "songs.parquet")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = song_df.join(df, df.song == song_df.title)\
    .selectExpr(["timestamp as start_time"  ,\
                 "userid as user_id"        ,\
                 "level"                    ,\
                 "song_id"                  ,\
                 "artist_id"                ,\
                 "sessionid as session_id"  ,\
                 "location"                 ,\
                 "useragent as user_agent"]) \
    .withColumn("year", year("start_time"))  \
    .withColumn("month", month("start_time")).dropDuplicates()

    # write songplays table to parquet files partitioned by year and month
    songplays_table.createOrReplaceTempView("songplays")
    spark.sql("""
            SELECT row_number() over (order by start_time asc) as songplay_id,
                   start_time,
                   user_id,
                   level,
                   song_id,
                   artist_id,
                   session_id,
                   location,
                   user_agent,
                   year,
                   month
            FROM songplays
    """).write.mode("overwrite").partitionBy(
        "year", "month").parquet(output_data + "songplays.parquet")
Example #9
#df_Update.show()

#===== get all the time stamps for each user ========================
#test=df_Update.groupBy(['UserID'])
new = df_Update.groupBy(['UserID']).agg(collect_list("time_date"))
#test.show()
#==========sort time stamps for each user===========================
#func=udf(lambda x:sorted(x.tolist()))


def sorter(l):
    res = sorted(l)
    return [item for item in res]


sort_udf = udf(sorter, ArrayType(TimestampType()))

sorted_df = new.withColumn('sorted', sort_udf(new['collect_list(time_date)']))
print(sorted_df)
#sorted_df.show ()
#====== get the difference between each two successive time stamps.
#step 1 get a column of previous value
#0 if y[0]  else (y[i]-y[i-1] for i in


def diff(l):
    currsession = 1
    output = list()
    for i in range(len(l)):
        if (i == 0):
            output.append(currsession)
Example #10
data_sqlServer = spark.read.jdbc(url,
                                 "CMP.data_prueba_SqlS",
                                 properties=properties)

#Removing nulls from the data
data_clean = data_sqlServer.filter(
    col("id").isNotNull() & col("company_id").isNotNull()
    & col("amount").isNotNull() & col("status").isNotNull()
    & col("created_at").isNotNull())
data_clean.createOrReplaceTempView("data_clean")

#Transforming data to the final schema
df_clean = spark.sql(
    "Select substr(id,1,24) as id, substr(name,1,130) as name, substr(company_id,1,24) as company_id, cast(amount as decimal(16,2)) as amount, cast(status as varchar(30)) as status, cast(created_at as timestamp) as created_at, cast(paid_at as timestamp) as paid_at from data_clean"
)
data_rdd = df_clean.rdd

data_schema = StructType([
    StructField("id", StringType(), False),
    StructField("company_name", StringType(), True),
    StructField("company_id", StringType(), False),
    StructField("amount", DecimalType(18, 2), False),
    StructField("status", StringType(), False),
    StructField("created_at", TimestampType(), False),
    StructField("updated_at", TimestampType(), True)
])

data_transformed = spark.createDataFrame(data=data_rdd, schema=data_schema)

#Writing the final schema into a Hive table
data_transformed.write.mode("append").saveAsTable("final_data_table")
Example #11
def process_log_data(spark, input_data, output_data):
    """Reads data file containing the logs from user activity and loads into parquet files for time_table, users_table and songplays_table in S3.

    Keyword arguments:
    spark -- the spark session
    input_data -- the input file directory in S3
    output_data -- the output parquet file directory in S3
    """

    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*"

    # read log data file
    print("Reading log data file..")
    df = spark.read.format("json").load(log_data)

    # filter by actions for song plays
    df = df.where(df.page == "NextSong")

    # create timestamp column from original timestamp column
    from pyspark.sql.types import TimestampType

    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000),
                        TimestampType())
    df = df.withColumn('start_time', get_timestamp('ts'))

    # create schema-on-read table for log data
    df.createOrReplaceTempView("log_data")

    # extract columns for users table
    users_table = spark.sql("""
                            SELECT qry.userid,
                                   qry.firstname,
                                   qry.lastname,
                                   qry.gender,
                                   qry.level
                              FROM (
                                    SELECT start_time,
                                           userid,
                                           firstname,
                                           lastname,
                                           gender,
                                           level,
                                           RANK() OVER (PARTITION BY userid ORDER BY start_time DESC) AS rank
                                      FROM log_data
                                   ) AS qry
                            WHERE qry.rank = 1
                           """)

    # write users table to parquet files
    print("Creating users table parquet file..")
    users_table.write.parquet(output_data + "users_table")

    # extract columns to create time table
    time_table = df.select("start_time",
                           hour("start_time").alias('hour'),
                           dayofmonth("start_time").alias('day'),
                           weekofyear("start_time").alias('week'),
                           month("start_time").alias('month'),
                           year("start_time").alias('year'),
                           date_format("start_time",
                                       "u").alias('weekday')).distinct()

    # write time table to parquet files partitioned by year and month
    print("Creating time table parquet file..")
    time_table.write.partitionBy("year",
                                 "month").parquet(output_data + "time_table")

    # read in song, artist and time data to use for songplays table
    print("Reading song parquet file..")
    song_df = spark.read.parquet(
        "s3a://gfkw-dend-udacity/project4/songs_table")
    song_df.createOrReplaceTempView("songs_table")

    print("Reading artist parquet file..")
    artist_df = spark.read.parquet(
        "s3a://gfkw-dend-udacity/project4/artists_table")
    artist_df.createOrReplaceTempView("artists_table")

    print("Reading time parquet file..")
    time_table.createOrReplaceTempView("time_table")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = spark.sql("""
                                SELECT l.start_time,
                                       t.year,
                                       t.month,
                                       l.userid,
                                       l.level,
                                       q.song_id,
                                       q.artist_id,
                                       l.sessionid,
                                       l.location,
                                       l.useragent
                                  FROM log_data l
                                  JOIN time_table t ON (l.start_time = t.start_time)
                                 LEFT JOIN (
                                           SELECT s.song_id,
                                                  s.title,
                                                  a.artist_id,
                                                  a.artist_name
                                             FROM songs_table s
                                             JOIN artists_table a ON (s.artist_id = a.artist_id)
                                          ) AS q ON (l.song = q.title AND l.artist = q.artist_name)
                               """)

    # write songplays table to parquet files partitioned by year and month
    print("Creating songplays table parquet file..")
    songplays_table.write.partitionBy(
        "year", "month").parquet(output_data + "songplays_table")
Example #12
    def test_verify_type_not_nullable(self):
        import array
        import datetime
        import decimal

        schema = StructType([
            StructField('s', StringType(), nullable=False),
            StructField('i', IntegerType(), nullable=True)
        ])

        class MyObj:
            def __init__(self, **kwargs):
                for k, v in kwargs.items():
                    setattr(self, k, v)

        # obj, data_type
        success_spec = [
            # String
            ("", StringType()),
            (u"", StringType()),
            (1, StringType()),
            (1.0, StringType()),
            ([], StringType()),
            ({}, StringType()),

            # UDT
            (ExamplePoint(1.0, 2.0), ExamplePointUDT()),

            # Boolean
            (True, BooleanType()),

            # Byte
            (-(2**7), ByteType()),
            (2**7 - 1, ByteType()),

            # Short
            (-(2**15), ShortType()),
            (2**15 - 1, ShortType()),

            # Integer
            (-(2**31), IntegerType()),
            (2**31 - 1, IntegerType()),

            # Long
            (-(2**63), LongType()),
            (2**63 - 1, LongType()),

            # Float & Double
            (1.0, FloatType()),
            (1.0, DoubleType()),

            # Decimal
            (decimal.Decimal("1.0"), DecimalType()),

            # Binary
            (bytearray([1, 2]), BinaryType()),

            # Date/Timestamp
            (datetime.date(2000, 1, 2), DateType()),
            (datetime.datetime(2000, 1, 2, 3, 4), DateType()),
            (datetime.datetime(2000, 1, 2, 3, 4), TimestampType()),

            # Array
            ([], ArrayType(IntegerType())),
            (["1", None], ArrayType(StringType(), containsNull=True)),
            ([1, 2], ArrayType(IntegerType())),
            ((1, 2), ArrayType(IntegerType())),
            (array.array('h', [1, 2]), ArrayType(IntegerType())),

            # Map
            ({}, MapType(StringType(), IntegerType())),
            ({
                "a": 1
            }, MapType(StringType(), IntegerType())),
            ({
                "a": None
            }, MapType(StringType(), IntegerType(), valueContainsNull=True)),

            # Struct
            ({
                "s": "a",
                "i": 1
            }, schema),
            ({
                "s": "a",
                "i": None
            }, schema),
            ({
                "s": "a"
            }, schema),
            ({
                "s": "a",
                "f": 1.0
            }, schema),
            (Row(s="a", i=1), schema),
            (Row(s="a", i=None), schema),
            (["a", 1], schema),
            (["a", None], schema),
            (("a", 1), schema),
            (MyObj(s="a", i=1), schema),
            (MyObj(s="a", i=None), schema),
            (MyObj(s="a"), schema),
        ]

        # obj, data_type, exception class
        failure_spec = [
            # String (match anything but None)
            (None, StringType(), ValueError),

            # UDT
            (ExamplePoint(1.0, 2.0), PythonOnlyUDT(), ValueError),

            # Boolean
            (1, BooleanType(), TypeError),
            ("True", BooleanType(), TypeError),
            ([1], BooleanType(), TypeError),

            # Byte
            (-(2**7) - 1, ByteType(), ValueError),
            (2**7, ByteType(), ValueError),
            ("1", ByteType(), TypeError),
            (1.0, ByteType(), TypeError),

            # Short
            (-(2**15) - 1, ShortType(), ValueError),
            (2**15, ShortType(), ValueError),

            # Integer
            (-(2**31) - 1, IntegerType(), ValueError),
            (2**31, IntegerType(), ValueError),

            # Float & Double
            (1, FloatType(), TypeError),
            (1, DoubleType(), TypeError),

            # Decimal
            (1.0, DecimalType(), TypeError),
            (1, DecimalType(), TypeError),
            ("1.0", DecimalType(), TypeError),

            # Binary
            (1, BinaryType(), TypeError),

            # Date/Timestamp
            ("2000-01-02", DateType(), TypeError),
            (946811040, TimestampType(), TypeError),

            # Array
            (["1", None], ArrayType(StringType(),
                                    containsNull=False), ValueError),
            ([1, "2"], ArrayType(IntegerType()), TypeError),

            # Map
            ({
                "a": 1
            }, MapType(IntegerType(), IntegerType()), TypeError),
            ({
                "a": "1"
            }, MapType(StringType(), IntegerType()), TypeError),
            ({
                "a": None
            }, MapType(StringType(), IntegerType(),
                       valueContainsNull=False), ValueError),

            # Struct
            ({
                "s": "a",
                "i": "1"
            }, schema, TypeError),
            (Row(s="a"), schema, ValueError),  # Row can't have missing field
            (Row(s="a", i="1"), schema, TypeError),
            (["a"], schema, ValueError),
            (["a", "1"], schema, TypeError),
            (MyObj(s="a", i="1"), schema, TypeError),
            (MyObj(s=None, i="1"), schema, ValueError),
        ]

        # Check success cases
        for obj, data_type in success_spec:
            try:
                _make_type_verifier(data_type, nullable=False)(obj)
            except Exception:
                self.fail("verify_type(%s, %s, nullable=False)" %
                          (obj, data_type))

        # Check failure cases
        for obj, data_type, exp in failure_spec:
            msg = "verify_type(%s, %s, nullable=False) == %s" % (
                obj, data_type, exp)
            with self.assertRaises(exp, msg=msg):
                _make_type_verifier(data_type, nullable=False)(obj)
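A standalone sketch of the verifier this test drives: _make_type_verifier (the same private helper from pyspark.sql.types used above) returns a callable that raises TypeError or ValueError when a Python object does not fit the declared type.

from pyspark.sql.types import IntegerType, _make_type_verifier

verify_int = _make_type_verifier(IntegerType(), nullable=False)
verify_int(42)        # fits: passes silently

try:
    verify_int(None)  # nullable=False, so None is rejected
except ValueError as exc:
    print(exc)

try:
    verify_int("42")  # wrong Python type for IntegerType
except TypeError as exc:
    print(exc)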
Example #13
    def test_timestamp_microsecond(self):
        tst = TimestampType()
        self.assertEqual(
            tst.toInternal(datetime.datetime.max) % 1000000, 999999)
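A standalone sketch of what the assertion relies on: TimestampType stores values internally as microseconds since the epoch, so the microsecond component survives a toInternal/fromInternal round trip.

import datetime
from pyspark.sql.types import TimestampType

tst = TimestampType()
dt = datetime.datetime(2021, 1, 1, 12, 30, 45, 999999)
internal = tst.toInternal(dt)       # integer count of microseconds since the epoch
print(internal % 1000000)           # 999999 -- the microseconds are preserved
print(tst.fromInternal(internal))   # 2021-01-01 12:30:45.999999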
Example #14
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, udf
from pyspark.sql.types import (DateType, IntegerType, FloatType, StringType,
                               StructField, StructType, TimestampType)

spark = SparkSession.builder.appName("Read Transactions").getOrCreate()

csv_schema = StructType([
    StructField('customer_id', IntegerType()),
    StructField('amount', FloatType()),
    StructField('purchased_at', TimestampType()),
])

dataframe = spark.read.csv("transactions.csv", schema=csv_schema, header=True)

dataframe.show()

# Add a new column by formatting the original date
formatted_df = dataframe.withColumn(
    "date_string", date_format(col("purchased_at"), 'MM/dd/yyyy'))
formatted_df.show()

# Create a user defined function
string_to_date = \
    udf(lambda text_date: datetime.strptime(text_date, '%m/%d/%Y'),
        DateType())

typed_df = formatted_df.withColumn("date",
                                   string_to_date(formatted_df.date_string))
Example #15
def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    log_data = input_data + "log-data/*/*/*.json"

    # read log data file
    print('Input log data json file read started')
    df = spark.read.json(log_data,
                         mode='PERMISSIVE',
                         columnNameOfCorruptRecord='corrupt_record')
    print('Input log data json file read completed')

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    print('users_table data extraction started \n')
    users_table = df.select("userId", "firstName", "lastName", "gender",
                            "level").drop_duplicates()
    print('users_table data extraction completed \n')

    # write users table to parquet files
    print('users_table data write started \n')
    users_table.write.parquet(output_data + "users_table/", mode="overwrite")
    print('users_table write Completed')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(int(x) / 1000),
                        TimestampType())
    df = df.withColumn("start_time", get_timestamp("ts"))

    # create datetime column from original timestamp column
    #get_datetime = udf()
    #df =

    # extract columns to create time table
    print('time_table data extraction started \n')
    time_table=df.select('start_time').drop_duplicates() \
        .withColumn('hour', hour(col('start_time'))) \
        .withColumn('day', dayofmonth(col('start_time'))) \
        .withColumn('week', weekofyear(col('start_time'))) \
        .withColumn('month', month(col('start_time'))) \
        .withColumn('year', year(col('start_time'))) \
        .withColumn('weekday', dayofweek(col('start_time')))
    #time_table.show()
    print('time_table data extraction completed \n')

    # write time table to parquet files partitioned by year and month
    print('time_table data write started \n')
    time_table.write.parquet(output_data + "time_table/", mode="overwrite")
    print('time_table data write Completed \n')

    # read in song data to use for songplays table
    song_df = spark.read.format("parquet").option(
        "basePath",
        os.path.join(output_data,
                     "songs/")).load(os.path.join(output_data, "songs/*/*/"))

    # extract columns from joined song and log datasets to create songplays table
    print('songplays_table data extraction started \n')
    songplays_table = df.join(song_df, df.song == song_df.title, how='inner')\
                        .select(monotonically_increasing_id().alias("songplay_id"),
                         col("start_time"),
                         col("userId").alias("user_id"),
                         col("level"),
                         col("song_id"),
                         col("artist_id"),
                         col("sessionId").alias("session_id"),
                         col("location"),
                         col("userAgent").alias("user_agent")
                        )
    print('songplays_table data extraction completed \n')
    # write songplays table to parquet files partitioned by year and month
    print('songplays_table data write started \n')
    songplays_table.write.parquet(output_data + "songplays/",
                                  mode="overwrite")
    print('songplays_table data write completed \n')
Example #16
def start_stream(args):
    validate_params(args)
    _, brokers, topic = args

    spark = create_spark_session()

    json = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", brokers) \
        .option("subscribe", topic) \
        .load()

    json.printSchema()

    # Explicitly set schema
    schema = StructType([StructField("symbol", StringType(), False),
                         StructField("timestamp", TimestampType(), False),
                         StructField("price", DoubleType(), False)])

    json_options = {"timestampFormat": "yyyy-MM-dd'T'HH:mm'Z'"}
    stocks_json = json \
        .select(from_json(F.col("value").cast("string"), schema, json_options).alias("content"))

    stocks_json.printSchema()

    stocks = stocks_json.select("content.*")

    ####################################
    # Stream to Parquet
    ####################################
    query = stocks \
        .withColumn('year', year(F.col('timestamp'))) \
        .withColumn('month', month(F.col('timestamp'))) \
        .withColumn('day', dayofmonth(F.col('timestamp'))) \
        .withColumn('hour', hour(F.col('timestamp'))) \
        .withColumn('minute', minute(F.col('timestamp'))) \
        .writeStream \
        .format('parquet') \
        .partitionBy('year', 'month', 'day', 'hour', 'minute') \
        .option('startingOffsets', 'earliest') \
        .option('checkpointLocation', '/dataset/checkpoint') \
        .option('path', '/dataset/streaming.parquet') \
        .trigger(processingTime='30 seconds') \
        .start()

    avg_pricing = stocks \
        .groupBy(F.col("symbol")) \
        .agg(F.avg(F.col("price")).alias("avg_price"))

    ####################################
    # Console Output
    ####################################
    query2 = avg_pricing.writeStream \
        .outputMode('complete') \
        .format("console") \
        .trigger(processingTime="10 seconds") \
        .start()

    ####################################
    # Table in Memory
    ####################################
    # query3 = avg_pricing \
    #     .writeStream \
    #     .queryName("avgPricing") \
    #     .outputMode("complete") \
    #     .format("memory") \
    #     .trigger(processingTime="10 seconds") \
    #     .start()
    #
    # while True:
    #     print('\n' + '_' * 30)
    #     # interactively query in-memory table
    #     spark.sql('SELECT * FROM avgPricing').show()
    #     print(query3.lastProgress)
    #     sleep(10)

    query2.awaitTermination()
    pass
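A hypothetical way to launch the job above: validate_params and create_spark_session are defined elsewhere in the original script, and the package coordinates, script name, broker address, and topic below are placeholders.

# spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2 \
#     stream_to_parquet.py localhost:9092 stocks
import sys

if __name__ == "__main__":
    # argv carries (script name, broker list, topic), matching
    # `_, brokers, topic = args` in start_stream.
    start_stream(sys.argv)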
# MAGIC <img alt="Hint" title="Hint" style="vertical-align: text-bottom; position: relative; height:1.75em; top:0.3em" src="https://files.training.databricks.com/static/images/icon-light-bulb.svg"/>&nbsp;**Hint:** Use `alias` to alias the name of your columns to the final name you want for them.
# MAGIC <img alt="Hint" title="Hint" style="vertical-align: text-bottom; position: relative; height:1.75em; top:0.3em" src="https://files.training.databricks.com/static/images/icon-light-bulb.svg"/>&nbsp;**Hint:** `id` corresponds to `tweet_id` and `user.id` corresponds to `user_id`.

# COMMAND ----------

from pyspark.sql.functions import col, unix_timestamp
from pyspark.sql.types import TimestampType

timestampFormat = "EEE MMM dd HH:mm:ss ZZZZZ yyyy"

tweetDF = fullTweetFilteredDF.select(
    col("id").alias("tweetID"),
    col("user.id").alias("userID"),
    col("lang").alias("language"), col("text"),
    unix_timestamp("created_at",
                   timestampFormat).cast(TimestampType()).alias("createdAt"))

display(tweetDF)

# COMMAND ----------

# TEST - Run this cell to test your solution
from pyspark.sql.types import TimestampType
t = tweetDF.select("createdAt").schema[0]

dbTest("ET1-P-08-07-01", TimestampType(), t.dataType)

print("Tests passed!")

# COMMAND ----------
Example #18
def process_log_data(spark, input_data, output_data):
    """
    Loads the Log files, extracts the data for users table, time table and songplays table then saves it to parquet files
    
    Parameters:
    
    spark: spark session
    input_data: input files path
    output_data: output files path
    """

    # get filepath to log data file
    log_data = input_data

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == "NextSong")

    # extract columns for users table
    users_table = df.select("userId", "firstName", "lastName", "gender",
                            "level").drop_duplicates()

    # write users table to parquet files
    users_table.write.parquet(output_data + "users/", mode="overwrite")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda ts: datetime.utcfromtimestamp(int(ts) / 1000),
                        TimestampType())
    df = df.withColumn("timestamp", get_timestamp(col("ts")))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda ts: datetime.utcfromtimestamp(int(ts) / 1000).date(),
                       DateType())
    df = df.withColumn("start_time", get_timestamp(col("ts")))

    # extract columns to create time table
    df = df.withColumn("hour",hour("start_time"))\
        .withColumn("day",dayofmonth("start_time"))\
        .withColumn("week",weekofyear("start_time"))\
        .withColumn("month",month("start_time"))\
        .withColumn("year",year("start_time"))\
        .withColumn("weekday",dayofweek("start_time"))

    time_table = df.select("start_time", "hour", "day", "week", "month",
                           "year", "weekday").distinct()

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(output_data + "time_table/",
                             mode='overwrite',
                             partitionBy=["year", "month"])

    # read in song data to use for songplays table
    song_df = spark.sql(
        "SELECT DISTINCT song_id, artist_id, artist_name FROM df_songs_table")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df, song_df.artist_name == df.artist, "inner") \
        .distinct() \
        .select("start_time", "userId", "level", "sessionId", "location", "userAgent","song_id","artist_id", "month", "year") \
        .withColumn("songplay_id", monotonically_increasing_id())

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.parquet(output_data + "songplays_table/",
                                  mode="overwrite",
                                  partitionBy=["year", "month"])
Example #19
def main(base_path):
    APP_NAME = "train_spark_mllib_model.py"

    # Create the environment if a SparkSession does not already exist
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),
        StructField("CRSArrTime", TimestampType(), True),
        StructField("CRSDepTime", TimestampType(), True),
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Route", StringType(), True),
        StructField("TailNum", StringType(), True),
        StructField("EngineManufacturer", StringType(), True),
        StructField("EngineModel", StringType(), True),
        StructField("Manufacturer", StringType(), True),
        StructField("ManufacturerYear", StringType(), True),
        StructField("OwnerState", StringType(), True),
        StructField("FlightTime", IntegerType(), True),
    ])

    input_path = "{}/data/simple_flight_delay_features_flight_times.json".format(
        base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Add the scheduled arrival/departure times
    #
    from pyspark.sql.functions import hour
    features_with_hour = features.withColumn("CRSDepHourOfDay",
                                             hour(features.CRSDepTime))
    features_with_hour = features_with_hour.withColumn(
        "CRSArrHourOfDay", hour(features.CRSArrTime))
    features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime",
                              "CRSArrHourOfDay").show()

    #
    # Check the features for null values before using Spark ML
    #
    null_counts = [
        (column,
         features_with_hour.where(features_with_hour[column].isNull()).count())
        for column in features_with_hour.columns
    ]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print("\nNull Value Report")
    print("-----------------")
    print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
    #
    from pyspark.ml.feature import Bucketizer

    # Configure the bucketizer model
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")

    # Save the model
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the model
    ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Import the feature tools from pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn the categorical fields into indexes
    string_columns = ["Carrier", "Origin", "Dest", "Route", "TailNum"]
    for column in string_columns:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")

        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)

        # Save the pipeline model
        string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
            base_path, column)
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    # Combine the continuous numeric fields with the categorical field indexes into a single feature vector
    numeric_columns = [
        "DepDelay", "Distance", "DayOfYear", "CRSDepHourOfDay",
        "CRSArrHourOfDay", "FlightTime"
    ]
    index_columns = [column + "_index" for column in string_columns]

    vector_assembler = VectorAssembler(inputCols=numeric_columns +
                                       index_columns,
                                       outputCol="Features_vec")
    final_vectorized_features = vector_assembler.transform(
        ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler_6.0.bin".format(
        base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    #
    # Cross-validate, train, and evaluate the classifier: repeat 5 times over 4 metrics
    #

    from collections import defaultdict
    scores = defaultdict(list)
    feature_importances = defaultdict(list)
    metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
    split_count = 3

    for i in range(1, split_count + 1):
        print("\nRun {} out of {} of test/train splits in cross validation...".
              format(
                  i,
                  split_count,
              ))

        # Split into test/train data
        training_data, test_data = final_vectorized_features.randomSplit(
            [0.8, 0.2])

        # Instantiate and fit a random forest classifier on all the data
        from pyspark.ml.classification import RandomForestClassifier
        rfc = RandomForestClassifier(
            featuresCol="Features_vec",
            labelCol="ArrDelayBucket",
            predictionCol="Prediction",
            maxBins=4896,
        )
        model = rfc.fit(training_data)

        # Overwrite the previous model with the new one
        model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.flight_time.bin".format(
            base_path)
        model.write().overwrite().save(model_output_path)

        # Evaluate the model against the test data
        predictions = model.transform(test_data)

        # Evaluate this test/train split's results for each metric
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator
        for metric_name in metric_names:
            evaluator = MulticlassClassificationEvaluator(
                labelCol="ArrDelayBucket",
                predictionCol="Prediction",
                metricName=metric_name)
            score = evaluator.evaluate(predictions)

            scores[metric_name].append(score)
            print("{} = {}".format(metric_name, score))

        #
        # Collect the feature importances
        #
        feature_names = vector_assembler.getInputCols()
        feature_importance_list = model.featureImportances
        for feature_name, feature_importance in zip(feature_names,
                                                    feature_importance_list):
            feature_importances[feature_name].append(feature_importance)

    #
    # Compute the average and standard deviation of each metric and tabulate them
    #
    import numpy as np
    score_averages = defaultdict(float)

    # Compute the table data
    average_stds = []
    for metric_name in metric_names:
        metric_scores = scores[metric_name]

        average_score = sum(metric_scores) / len(metric_scores)
        score_averages[metric_name] = average_score

        std_score = np.std(metric_scores)

        average_stds.append((metric_name, average_score, std_score))

    # Print the table
    print("\nExperiment Log")
    print("--------------")
    print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))

    #
    # Persist the scores to a score log kept between runs
    #
    import pickle

    # Load the score log or initialize an empty one
    try:
        score_log_filename = "{}/models/score_log.pickle".format(base_path)
        score_log = pickle.load(open(score_log_filename, "rb"))
        if not isinstance(score_log, list):
            score_log = []
    except IOError:
        score_log = []

    # Compute this run's score log entry
    score_log_entry = {
        metric_name: score_averages[metric_name]
        for metric_name in metric_names
    }

    # Compute and display the change in score for each metric
    try:
        last_log = score_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_log = score_log_entry

    experiment_report = []
    for metric_name in metric_names:
        run_delta = score_log_entry[metric_name] - last_log[metric_name]
        experiment_report.append((metric_name, run_delta))

    print("\nExperiment Report")
    print("-----------------")
    print(tabulate(experiment_report, headers=["Metric", "Score"]))

    # Append the average scores to the log
    score_log.append(score_log_entry)

    # Persist the log for the next run
    pickle.dump(score_log, open(score_log_filename, "wb"))

    #
    # Analyze and report changes in feature importance
    #

    # Compute the average importance of each feature
    feature_importance_entry = defaultdict(float)
    for feature_name, value_list in feature_importances.items():
        average_importance = sum(value_list) / len(value_list)
        feature_importance_entry[feature_name] = average_importance

    # Sort the feature importances in descending order and print them
    import operator
    sorted_feature_importances = sorted(feature_importance_entry.items(),
                                        key=operator.itemgetter(1),
                                        reverse=True)

    print("\nFeature Importances")
    print("-------------------")
    print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))

    #
    # Compare this run's feature importances with those of the previous run
    #

    # Load the feature importance log or initialize an empty one
    try:
        feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
        feature_log = pickle.load(open(feature_log_filename, "rb"))
        if not isinstance(feature_log, list):
            feature_log = []
    except IOError:
        feature_log = []

    # Compute and display the change in importance for each feature
    try:
        last_feature_log = feature_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_feature_log = defaultdict(float)
        for feature_name, importance in feature_importance_entry.items():
            last_feature_log[feature_name] = importance

    # Compute the deltas
    feature_deltas = {}
    for feature_name in feature_importances.keys():
        run_delta = feature_importance_entry[feature_name] - last_feature_log[
            feature_name]
        feature_deltas[feature_name] = run_delta

    # Sort the feature deltas so the largest changes come first
    sorted_feature_deltas = sorted(feature_deltas.items(),
                                   key=operator.itemgetter(1),
                                   reverse=True)

    # Display the sorted feature deltas
    print("\nFeature Importance Delta Report")
    print("-------------------------------")
    print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))

    # Append this run's average feature importances to the log
    feature_log.append(feature_importance_entry)

    # Persist the log for the next run
    pickle.dump(feature_log, open(feature_log_filename, "wb"))
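
# A minimal reload sketch (not part of the original example): how the models
# persisted above might be loaded back for scoring in a separate job. It
# assumes the same `base_path` and a DataFrame `new_features` with the same
# columns as `features_with_hour`.
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.ml.feature import StringIndexerModel, VectorAssembler

vector_assembler = VectorAssembler.load(
    "{}/models/numeric_vector_assembler_6.0.bin".format(base_path))
rfc_model = RandomForestClassificationModel.load(
    "{}/models/spark_random_forest_classifier.flight_delays.flight_time.bin".format(
        base_path))

scored = new_features
for column in ["Carrier", "Origin", "Dest", "Route", "TailNum"]:
    indexer = StringIndexerModel.load(
        "{}/models/string_indexer_model_4.0.{}.bin".format(base_path, column))
    scored = indexer.transform(scored)
predictions = rfc_model.transform(vector_assembler.transform(scored))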
Ejemplo n.º 20
0
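
# Imports and window spec assumed by this snippet (not shown in the original);
# the partition/order columns in `w` are hypothetical placeholders, as is the
# pw_df source DataFrame.
from datetime import timedelta
from math import isnan

from pyspark.sql import functions as F
from pyspark.sql.functions import lead, udf
from pyspark.sql.types import TimestampType
from pyspark.sql.window import Window

w = Window.partitionBy("core_id").orderBy("time")  # hypothetical columns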

#now need the most accurate outage time possible for outage event
#now find all the exact outage and restore times using millis
def timeCorrect(time, millis, unplugMillis):
    if (unplugMillis == 0 or millis is None or unplugMillis is None
            or isnan(millis) or isnan(unplugMillis)):
        return time
    elif unplugMillis > millis:
        return time
    else:
        return time - timedelta(
            microseconds=(int(millis) - int(unplugMillis)) * 1000)


udftimeCorrect = udf(timeCorrect, TimestampType())
pw_df = pw_df.withColumn(
    "outage_time", udftimeCorrect("time", "millis", "last_unplug_millis"))
pw_df = pw_df.withColumn("outage_time", F.unix_timestamp("outage_time"))
pw_df = pw_df.withColumn("r_time",
                         udftimeCorrect("time", "millis", "last_plug_millis"))
pw_df = pw_df.withColumn("r_time", F.unix_timestamp("r_time"))

#now denote the end time of the outage for SAIDI purposes
time_lead = lead("r_time", 1).over(w)
pw_df = pw_df.withColumn("restore_time", time_lead)

#now filter out everything that is not an outage. We should have a time and end_time for every outage
pw_df = pw_df.filter("outage != 0")

# Okay now that we have the outages and times we should join it with the number of sensors reporting above
Ejemplo n.º 21
0
# Create DataFrame representing the stream of input lines
# from connection to localhost:9999
# lines = spark \
# 	.readStream \
# 	.format("socket") \
# 	.option("host", "localhost") \
# 	.option("port", 9999) \
# 	.load()

# Read all the csv files written atomically in a directory
# userA, userB, timestamp, interaction
userSchema = StructType()\
 .add("userA", "integer")\
 .add("userB", "integer")\
 .add("timestamp", TimestampType())\
 .add("interaction","string")

activity = spark \
 .readStream \
 .option("sep", ",") \
 .schema(userSchema) \
 .csv(staging_dir+"/*.csv")  # Equivalent to format("csv").load("/path/to/directory")

#.option("inferSchema", "true")

# Split the lines into words
# words = lines.select(
# 	explode(
# 		split(lines.value, " ")
# 	).alias("word")
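
# A minimal sketch (an assumption, not part of the original snippet) of how the
# streaming `activity` DataFrame might be consumed: count rows per interaction
# type and write the running totals to the console sink.
interaction_counts = activity.groupBy("interaction").count()

query = interaction_counts \
    .writeStream \
    .outputMode("complete") \
    .format("console") \
    .start()
# query.awaitTermination()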
Ejemplo n.º 22
0
    exprs = [ "{} as {}".format(field[0],field[1]) for field in fields]
    df = df.selectExpr(*exprs)

    # extract columns for users table
    user_fields = ['user_id', 'first_name', 'last_name', 'gender', 'level']
    users_table = df.select(user_fields).dropDuplicates()

    # write users table to parquet files
    users_table.write.parquet(output_data + 'users/')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: x/1000, Dbl())
    df = df.withColumn('ts2', get_timestamp('ts'))

    # create datetime column from original timestamp column
    df = df.withColumn('start_time', from_unixtime('ts2').cast(dataType=TimestampType()))

    # extract columns to create time table
    time_table = df.select('start_time')\
                        .dropDuplicates()\
                        .withColumn('hour', hour(col('start_time')))\
                        .withColumn('day', dayofmonth(col('start_time')))\
                        .withColumn('week', weekofyear(col('start_time')))\
                        .withColumn('month', month(col('start_time')))\
                        .withColumn('year', year(col('start_time')))\
                        .withColumn('weekday', date_format(col('start_time'), 'E'))

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(output_data + 'time/')

    # read in song data to use for songplays table
Ejemplo n.º 23
0
def from_arrow_type(at: "pa.DataType",
                    prefer_timestamp_ntz: bool = False) -> DataType:
    """Convert pyarrow type to Spark data type."""
    from distutils.version import LooseVersion
    import pyarrow as pa
    import pyarrow.types as types

    spark_type: DataType
    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at) and prefer_timestamp_ntz and at.tz is None:
        spark_type = TimestampNTZType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_duration(at):
        spark_type = DayTimeIntervalType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " +
                            str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_map(at):
        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError(
                "MapType is only supported with pyarrow 2.0.0 and above")
        if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type):
            raise TypeError("Unsupported type in conversion from Arrow: " +
                            str(at))
        spark_type = MapType(from_arrow_type(at.key_type),
                             from_arrow_type(at.item_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError(
                "Nested StructType not supported in conversion from Arrow: " +
                str(at))
        return StructType([
            StructField(field.name,
                        from_arrow_type(field.type),
                        nullable=field.nullable) for field in at
        ])
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    elif types.is_null(at):
        spark_type = NullType()
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " +
                        str(at))
    return spark_type
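
# A small usage sketch (not part of the original function). It assumes the
# Spark type classes used above (LongType, StringType, ArrayType, ...) have
# been imported from pyspark.sql.types.
import pyarrow as pa

print(from_arrow_type(pa.int64()))              # maps to LongType
print(from_arrow_type(pa.string()))             # maps to StringType
print(from_arrow_type(pa.list_(pa.float64())))  # maps to ArrayType(DoubleType)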
Ejemplo n.º 24
0
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, IntegerType
from pyspark.sql.types import TimestampType

HOME = '/usr/local/airflow/spark'
spark = SparkSession.builder.appName('Example').getOrCreate()

schema = StructType() \
    .add('uid', IntegerType(), True) \
    .add('page_name', StringType(), True) \
    .add('page_url', StringType(), True) \
    .add('time', TimestampType(), True)

df = spark.read.format('csv').schema(schema).load(f'{HOME}/hitlog.csv')

df.createOrReplaceTempView("hitlog")
result_data = spark.sql("""WITH registered AS (
        SELECT uid, time FROM hitlog WHERE page_name == '/register')
    SELECT t.page_name, t.page_url, COUNT(*) AS hits
    FROM hitlog t JOIN registered r ON t.uid == r.uid
    WHERE t.page_url LIKE '%/article%'
      AND t.time <= r.time
    GROUP BY t.page_name, t.page_url
    ORDER BY hits DESC
    LIMIT 3
    """)

result_data.write.format('csv').mode('overwrite').option(
    'sep', ',').save(f'{HOME}/output.csv')
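
# An equivalent DataFrame-API formulation of the query above (a sketch for
# comparison, not part of the original snippet).
from pyspark.sql import functions as F

registered = df.filter(F.col('page_name') == '/register') \
    .select('uid', F.col('time').alias('reg_time'))

result_df = df.join(registered, 'uid') \
    .filter(F.col('page_url').like('%/article%')) \
    .filter(F.col('time') <= F.col('reg_time')) \
    .groupBy('page_name', 'page_url') \
    .agg(F.count('*').alias('hits')) \
    .orderBy(F.desc('hits')) \
    .limit(3)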
Ejemplo n.º 25
0
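# Imports assumed by this example (not shown in the snippet), listed so the
# function below is self-contained; the logger configuration is a placeholder.
import logging
from datetime import datetime

from pyspark.sql import functions as F
from pyspark.sql.functions import (udf, hour, dayofmonth, weekofyear, month,
                                   year, dayofweek)
from pyspark.sql.types import TimestampType

logger = logging.getLogger(__name__)
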
def process_log_data(spark, input_data, output_data):
    """
    Reads log data into a dataframe, which is then used to build the users and time tables.
    Reads song data and joins it with the log dataframe to build the songplays table.
    Drops duplicates, renames columns, and finally saves all tables in parquet format.

    :param spark: Spark session object
    :param input_data: S3 or local dir containing log and song data
    :param output_data: Path for parquet output files
    """
    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"  # S3 dir structure
    # log_data = input_data + "log_data/*.json"           # local dir structure

    # read log data file
    logger.info('Reading log data json files')
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df[df['page'] == 'NextSong']

    # extract columns for users table
    users_table = df[['userId', 'firstName', 'lastName', 'gender', 'level']]
    users_table = users_table \
        .withColumnRenamed('userId', 'user_id') \
        .withColumnRenamed('firstName', 'first_name') \
        .withColumnRenamed('lastName', 'last_name') \
        .dropDuplicates()

    # write users table to parquet files
    logger.info('Writing users table in parquet format')
    users_table.write.parquet(output_data + '/tbl_users.parquet')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000.0),
                        TimestampType())
    df = df.withColumn('start_time', get_timestamp(df.ts))

    # create datetime columns from derived start_time column
    df = df.withColumn('hour', hour(df.start_time))
    df = df.withColumn('day', dayofmonth(df.start_time))
    df = df.withColumn('week', weekofyear(df.start_time))
    df = df.withColumn('month', month(df.start_time))
    df = df.withColumn('year', year(df.start_time))
    df = df.withColumn('weekday', dayofweek(df.start_time))

    # extract columns to create time table
    time_table = df[[
        'start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday'
    ]]
    time_table = time_table.dropDuplicates()

    # write time table to parquet files partitioned by year and month
    logger.info(
        'Writing time table partitioned by year and month in parquet format')
    time_table.write.partitionBy('year', 'month').parquet(output_data +
                                                          '/tbl_time.parquet')

    # read in song data to use for songplays table
    logger.info("Reading song data for join")
    song_df = spark.read.json(input_data + 'song_data/*/*/*/*.json')
    song_df = song_df.withColumnRenamed('year', 'song_year')

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df, song_df.artist_name == df.artist,
                              'inner')
    songplays_table = songplays_table.withColumn(
        "songplay_id", F.monotonically_increasing_id())
    songplays_table = songplays_table[[
        'songplay_id', 'start_time', 'userId', 'level', 'song_id', 'artist_id',
        'sessionId', 'location', 'userAgent', 'month', 'year'
    ]]
    songplays_table = songplays_table \
        .withColumnRenamed('userId', 'user_id') \
        .withColumnRenamed('sessionId', 'session_id') \
        .withColumnRenamed('userAgent', 'user_agent')

    # write songplays table to parquet files partitioned by year and month
    logger.info(
        'Writing songplays table partitioned by year and month in parquet format'
    )
    songplays_table.write.partitionBy(
        'year', 'month').parquet(output_data + '/tbl_songplays.parquet')
Ejemplo n.º 26
0
#For databricks related packages
#./bin/pyspark --packages com.databricks:spark-csv_2.10:1.3.0

#Before Spark 1.4
train = sqlContext.load(source="com.databricks.spark.csv", path = 'PATH/train.csv', header = True,inferSchema = True)
test = sqlContext.load(source="com.databricks.spark.csv", path = 'PATH/test-comb.csv', header = True,inferSchema = True)

#Current Spark 2.1 and ...
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("yarn").getOrCreate()
df = spark.read.csv('hdfs://hadoop-master:9000/index/train.csv',mode="DROPMALFORMED")

#From local
from pyspark.sql.types import StructType, StructField, LongType, StringType, TimestampType
schema = StructType([StructField('', LongType(), True),   # unnamed first column
                     StructField('col1', LongType(), True),
                     StructField('col2', StringType(), True),
                     StructField('col3', StringType(), True),
                     StructField('col4', TimestampType(), True),
                     StructField('col5', TimestampType(), True),
                     StructField('col6', StringType(), True)])
df = spark.read.csv('file:///index/data_extract_restart2_without_cert/data_refined.csv',
                    mode="DROPMALFORMED", schema=schema)

#Creating UDF (renamed from `dict` to avoid shadowing the builtin)
from pyspark.sql.functions import udf

def replace_commas(sk):
    # replace commas with pipes in column col2; the result goes in new_column_name
    return sk.replace(',', '|')


udf_dict = udf(replace_commas, StringType())

df.withColumn('new_column_name', udf_dict("col2")).write.csv(path="/index/skill_clean_v3")  # col2 is the column to be changed

#Executing SQL queries
df.createOrReplaceTempView("data")
sqlDF = spark.sql("SELECT * FROM data")
Ejemplo n.º 27
0
def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"
    song_data = input_data + "song_data/*/*/*/*.json"
    #song_data = input_data + "song_data/A/A/A/*.json"

    DEBUG and print("Reading log data files from", log_data)

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong') \
            .where(df.ts.isNotNull()) \
            .withColumn("userId", df["userId"].cast(IntegerType())) \
            .withColumn("sessionId", df["sessionId"].cast(IntegerType()))

    DEBUG and print("Preparing users table")

    # extract columns for users table
    users_table = df.select(
        "userId", "firstName", "lastName", "gender",
        "level").where(col("userId").isNotNull()).dropDuplicates(['userId'])

    DEBUG and print("Creating and persisting users table")

    # write users table to parquet files
    users_table.write.parquet(output_data + "users/", mode='overwrite')

    DEBUG and print("Creating and persisting time table")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda ts: datetime.fromtimestamp(ts / 1000),
                        TimestampType())
    df = df.withColumn("start_time", get_timestamp(df.ts))

    # extract columns to create time table
    time_table = df.withColumn("hour", hour(df.start_time)) \
                    .withColumn("day", dayofmonth(df.start_time)) \
                    .withColumn("week", weekofyear(df.start_time)) \
                    .withColumn("month", month(df.start_time)) \
                    .withColumn("year", year(df.start_time)) \
                    .withColumn("weekday", dayofweek(df.start_time)) \
                    .select("start_time", "hour", "day", "week", "month", "year", "weekday") \
                    .dropDuplicates(["start_time"])

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy(["year",
                                  "month"]).parquet(output_data + "times/",
                                                    mode='overwrite')

    DEBUG and print("Creating and persisting songplays table")

    # read in song data to use for songplays table
    song_df = spark.read.json(song_data).select("song_id", "title",
                                                "artist_id", "artist_name")
    action_df = df.select("start_time", "userId", "level", "sessionId",
                          "location", "userAgent", "artist", "song")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = action_df.join(song_df, (action_df.artist == song_df.artist_name) & (action_df.song == song_df.title)) \
                                .select(monotonically_increasing_id().alias("songplay_id"), "start_time", "userId", "level", "song_id", "artist_id", "sessionId", "location", "userAgent") \
                                .withColumn("month", month(df.start_time)) \
                                .withColumn("year", year(df.start_time))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy(["year", "month"
                                       ]).parquet(output_data + "songplays/",
                                                  mode='overwrite')
Ejemplo n.º 28
0
def process_log_data(spark, input_data, output_data):
    """
        Description: This function loads log_data from S3, extracts the users, time, and songplays tables after processing,
        and then writes those tables to S3 in parquet format. The song view created by the previous function is reused for the songplays join.
        
        Parameters:
            spark       : Spark Session
            input_data  : Location of log_data files
            output_data : S3 bucket where extracted tables are written in parquet format.
            
    """
    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    users_table = df.select('userId', 'firstName', 'lastName', 'gender', 'level').dropDuplicates()\
                    .where(df.userId.isNotNull())

    # write users table to parquet files
    users_table.write.parquet(output_data + 'users/')

    # create UDF for timestamp column from original timestamp column
    @udf(TimestampType())
    def conv_timestamp(ms):
        return datetime.fromtimestamp(ms / 1000.0)

    # Lets add one more column with correct usable time stamp format
    df = df.withColumn("start_time", conv_timestamp('ts'))

    # Create a dataframe which only has start_time
    log_time_data = df.select('start_time').dropDuplicates()\
                    .where(df.start_time.isNotNull())

    # extract columns to create time table
    time_table = log_time_data.withColumn('hour',hour('start_time'))\
                              .withColumn('day',dayofmonth('start_time'))\
                              .withColumn('week', weekofyear('start_time'))\
                              .withColumn('month', month('start_time'))\
                              .withColumn('year',year('start_time'))\
                              .withColumn("weekday", date_format("start_time", 'E'))

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year",
                                 "month").parquet(output_data + 'times/')

    # create a view for the log_data and we already have the view for song_data as song created at the start
    df.createOrReplaceTempView('log_data_filtered_timeformatted')

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = spark.sql(
        """SELECT monotonically_increasing_id() AS songplay_id,
                                  start_time,
                                  userId AS user_id,
                                  level,
                                  song_id,
                                  artist_id,
                                  sessionId AS session_id,
                                  location,
                                  userAgent AS user_agent,
                                  year(start_time) AS year,
                                  month(start_time) AS month
                                  FROM log_data_filtered_timeformatted
                                  JOIN song
                                  ON artist = artist_name AND song = title """)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(output_data +
                                                               'songplays/')
Ejemplo n.º 29
0
def process_log_data(spark, input_data, output_data):
    """
    Function that read and transform log_data files to

    save user_table, time_table and songplays_table on S3 (in parquet extension)

    """

    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"  #real path
    # log_data = input_data + "log_data/2018/11/2018-11-12-events.json"

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where("page='NextSong'")

    # extract columns for users table
    user_table = df.select(col("userId").cast("int").alias("user_id"),\
                           col("firstName").alias("first_name"),\
                           col("lastName").alias("last_name"),"gender","level")

    user_table = user_table.dropDuplicates()

    # write users table to parquet files
    user_table.write.parquet(output_data + 'users/', 'overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: int(x) // 1000, IntegerType())
    df = df.withColumn("timestamp", get_timestamp("ts"))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(x), TimestampType())
    df = df.withColumn("datetime", get_datetime("timestamp"))

    #     print(df.limit(5).toPandas().head())
    # extract columns to create time table
    time_table = df.select(col("timestamp").alias("start_time"),\
                           hour("datetime").alias("hour"),\
                           dayofmonth("datetime").alias("day"),\
                           weekofyear("datetime").alias("week"),\
                           month("datetime").alias("month"),\
                           year("datetime").alias("year"),\
                           date_format('datetime','E').alias('weekday')
                          )
    #     print(time_table.limit(5).toPandas().head())
    time_table = time_table.dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year",
                                 "month").parquet(output_data + 'time/',
                                                  'overwrite')

    # read in song data to use for songplays table
    #     song_df = spark.read.json(input_data + "song_data/A/B/C/TRABCEI128F424C983.json")
    song_df = spark.read.json(input_data + "song_data/*/*/*/*.json")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.alias("a").join(song_df.alias("b"),\
                                         (df.song == song_df.title) & (df.artist == song_df.artist_name) & (df.length == song_df.duration)).\
    select(col("a.ts").alias("start_time"),col("a.userId").cast("int").alias("a.user_id"),"level",\
           col("a.sessionId").alias("session_id"),"a.location","a.userAgent","b.song_id","b.artist_id")

    get_start_time = udf(lambda x: datetime.fromtimestamp(int(x) / 1000),
                         TimestampType())
    songplays_table = songplays_table.withColumn("start_time",
                                                 get_start_time("start_time"))
    songplays_table = songplays_table.withColumn("songplay_id",
                                                 monotonically_increasing_id())
    songplays_table = songplays_table.withColumn("year", year("start_time"))
    songplays_table = songplays_table.withColumn("month", month("start_time"))

    #     print(songplays_table.limit(5).toPandas().head())

    songplays_table = songplays_table.dropDuplicates()

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(
        output_data + 'songplays/', 'overwrite')
Ejemplo n.º 30
0
def cal_performance(date, period, input_batch, output_batch):
    # Get the historical trading dates
    hist_dt = fetch_com_dt_hist(date)
    pef_horizions = {
        '1w': hist_dt.loc['B1W'].strftime('%Y%m%d'),
        '1m': hist_dt.loc['B1M'].strftime('%Y%m%d'),
        '3m': hist_dt.loc['B3M'].strftime('%Y%m%d'),
        '6m': hist_dt.loc['B6M'].strftime('%Y%m%d'),
        '1y': hist_dt.loc['B1Y'].strftime('%Y%m%d'),
        '3y': hist_dt.loc['B3Y'].strftime('%Y%m%d'),
        '5y': hist_dt.loc['B5Y'].strftime('%Y%m%d')
    }
    if period == 'all':
        start = None
    else:
        start = pef_horizions[period]
    ss = SparkSession \
        .builder \
        .appName(app_name + '_' + str(date) + '_' + period + '_' + str(is_debug)) \
        .getOrCreate()
    ss.sparkContext.setLogLevel('WARN')
    # Read the data from csv and convert the column types
    schema = StructType([
        StructField('date', TimestampType(), True),
        StructField('sec_id', StringType(), True),
        StructField('nav', FloatType(), True),
        StructField('ret', FloatType(), True),
        StructField('stock', FloatType(), True),
        StructField('treasury', FloatType(), True),
        StructField('credit', FloatType(), True),
        StructField('bench_ret', FloatType(), True),
        StructField('fnd_category', IntegerType(), True),
    ])
    # ret_all_spark_df = ss.read.csv(data_source_csv_path + date + '/' + str(input_batch) + '/ret_all.csv', header=True,
    #                                schema=schema)
    ret_all_spark_df = ss.read.csv(data_source_csv_path +
                                   '20200320/1/ret_all.csv',
                                   header=True,
                                   schema=schema)
    # In debug mode, keep only a subset of funds
    if is_debug:
        logging.info('use debug')
        # sec_id_list = ['000006JK', '000028JK', '000134JK', '000135JK']
        # sec_id_list = ['005503JK', '005368JK', '004892JK', '150066JK',
        #         '000189JK', '000270JK', '000327JK']
        # sec_id_list = ['150066JK']
        # sec_id_list = ['006382JK']
        today_spark_df = ret_all_spark_df.filter(
            ret_all_spark_df.date == datetime.strptime(date, '%Y%m%d'))
        rank_w = Window.orderBy('sec_id')
        today_spark_df = today_spark_df.withColumn(
            'row_no',
            func.row_number().over(rank_w))
        today_spark_df = today_spark_df.filter(
            today_spark_df.row_no <= 100).select('sec_id')
        ret_all_spark_df = ret_all_spark_df.join(today_spark_df,
                                                 on='sec_id',
                                                 how='inner')
        # ret_all_spark_df = ret_all_spark_df[ret_all_spark_df.sec_id.isin(sec_id_list)]
    else:
        logging.info('use release')
    # Slice: keep only data on or before `date` (date/start are %Y%m%d strings, so convert them to timestamps)
    ret_all_spark_df = ret_all_spark_df[
        ret_all_spark_df.date <= datetime.strptime(date, '%Y%m%d')]
    # Keep only funds whose last available date is on or after `date`
    w = Window.partitionBy('sec_id').orderBy('date').rowsBetween(
        Window.unboundedPreceding, Window.unboundedFollowing)
    ret_all_spark_df = ret_all_spark_df.withColumn('the_last_date',
                                                   func.last('date').over(w))
    ret_all_spark_df = ret_all_spark_df.where(
        ret_all_spark_df.the_last_date >= datetime.strptime(date, '%Y%m%d'))
    if period == 'all':
        # User-defined functions
        udf_mean = func.udf(lambda x: float(pd.Series(x).mean()), FloatType())
        udf_std = func.udf(lambda x: float(pd.Series(x).std()), FloatType())
        udf_min = func.udf(lambda x: float(pd.Series(x).min()), FloatType())
        udf_max = func.udf(lambda x: float(pd.Series(x).max()), FloatType())
        udf_p25 = func.udf(lambda x: float(pd.Series(x).quantile(0.25)),
                           FloatType())
        udf_median = func.udf(lambda x: float(pd.Series(x).median()),
                              FloatType())
        udf_p75 = func.udf(lambda x: float(pd.Series(x).quantile(0.75)),
                           FloatType())
        udf_skew = func.udf(lambda x: float(pd.Series(x).skew()), FloatType())
        udf_kurt = func.udf(lambda x: float(pd.Series(x).kurt()), FloatType())
        udf_start = func.udf(lambda x: str(x[0].strftime('%Y%m%d')),
                             StringType())
        udf_end = func.udf(lambda x: str(x[-1].strftime('%Y%m%d')),
                           StringType())
        udf_cagr = func.udf(lambda x: float(Measure.cal_cagr(pd.Series(x))),
                            FloatType())
        udf_cumret = func.udf(
            lambda x: float(Measure.cal_cumret(pd.Series(x))), FloatType())
        udf_standard_deviation = func.udf(
            lambda x: float(Measure.cal_standard_deviation(pd.Series(x))),
            FloatType())
        udf_max_drawdown = func.udf(
            lambda x, y: float(Measure.cal_max_drawdown(pd.Series(x, index=y))
                               ), FloatType())
        udf_sharpe = func.udf(
            lambda x: float(Measure.cal_sharpe(pd.Series(x))), FloatType())
        udf_downside_deviation = func.udf(
            lambda x: float(Measure.cal_downside_deviation(pd.Series(x))),
            FloatType())
        udf_alpha = func.udf(
            lambda x, y, z, w, f: float(
                Measure.cal_alpha(
                    pd.Series(x),
                    pd.DataFrame({
                        'stock': y,
                        'treasury': z,
                        'credit': w
                    }), f)), FloatType())
        udf_marketbeta = func.udf(
            lambda x, y: float(
                Measure.cal_marketbeta(pd.Series(x), pd.Series(y))),
            FloatType())
        udf_information = func.udf(
            lambda x, y: float(
                Measure.cal_information(pd.Series(x), pd.Series(y))),
            FloatType())
        udf_treynor = func.udf(
            lambda x, y: float(Measure.cal_treynor(pd.Series(x), pd.Series(y))
                               ), FloatType())
        # Filter out funds with insufficient history
        ret_all_spark_df = ret_all_spark_df.withColumn(
            'fund_length',
            func.count('date').over(w))
        ret_all_spark_df = ret_all_spark_df[
            ret_all_spark_df['fund_length'] >= 2]
        nt_val_spark_df = ret_all_spark_df[
            ret_all_spark_df.date == datetime.strptime(date, '%Y%m%d')].select(
                'sec_id', 'nav').withColumnRenamed('nav', 'nt_val')
        # Order within the window so ret is collected in date order; otherwise collect_list does not preserve it
        ret_all_spark_df = ret_all_spark_df.withColumn('ret_list',func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('ret').over(w)))\
            .withColumn('stock_ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('stock').over(w)))\
            .withColumn('treasury_ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('treasury').over(w)))\
            .withColumn('credit_ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('credit').over(w)))\
            .withColumn('date_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(datetime.strptime('2020-03-06','%Y-%m-%d')))).otherwise(func.collect_list('date').over(w)))
        nav_agg_part_1 = ret_all_spark_df[
            ret_all_spark_df.date == ret_all_spark_df.the_last_date].select(
                'sec_id', 'ret_list', 'stock_ret_list', 'treasury_ret_list',
                'credit_ret_list', 'date_list', 'fnd_category')
        if is_debug:
            nav_agg_part_1.show()
        # ret_all_spark_df is no longer needed below, so drop all of its columns
        ret_all_spark_df = ret_all_spark_df.drop(
            'sec_id', 'date', 'nav', 'ret', 'stock', 'treasury', 'credit',
            'bench_ret', 'fnd_category', 'the_last_date', 'fund_length',
            'ret_list', 'stock_ret_list', 'treasury_ret_list',
            'credit_ret_list', 'date_list')
        if is_debug:
            ret_all_spark_df.show()
        # Join in the NAV as of the current date
        nav_agg_part_1 = nav_agg_part_1.join(nt_val_spark_df,
                                             on=['sec_id'],
                                             how='left')
        nav_agg_part_1 = nav_agg_part_1.withColumn('ret_mean', udf_mean('ret_list')) \
            .withColumn('ret_std', udf_std('ret_list')) \
            .withColumn('ret_min', udf_min('ret_list')) \
            .withColumn('ret_max', udf_max('ret_list')) \
            .withColumn('ret_p25', udf_p25('ret_list')) \
            .withColumn('ret_median', udf_median('ret_list')) \
            .withColumn('ret_p75', udf_p75('ret_list')) \
            .withColumn('ret_skew', udf_skew('ret_list')) \
            .withColumn('ret_kurtosis', udf_kurt('ret_list')) \
            .withColumn('ret_start', udf_start('date_list')) \
            .withColumn('cagr_sf', udf_cagr('ret_list'))\
            .withColumn('cumret_sf', udf_cumret('ret_list'))\
            .withColumn('vol_sf', udf_standard_deviation('ret_list'))\
            .withColumn('md_sf', udf_max_drawdown('ret_list','date_list'))\
            .withColumn('sharpe_sf', udf_sharpe('ret_list'))\
            .withColumn('dvol_sf', udf_downside_deviation('ret_list'))\
            .withColumn('alpha_sf', udf_alpha('ret_list','stock_ret_list','treasury_ret_list','credit_ret_list','fnd_category'))\
            .withColumn('beta_sf', udf_marketbeta('ret_list','stock_ret_list'))\
            .withColumn('ir_sf', udf_information('ret_list','stock_ret_list'))\
            .withColumn('treynor_sf', udf_treynor('ret_list','stock_ret_list'))
        # Drop the intermediate columns
        nav_agg_part_1 = nav_agg_part_1.drop('ret_list', 'stock_ret_list',
                                             'treasury_ret_list',
                                             'credit_ret_list', 'date_list',
                                             'fnd_category')
        if is_debug:
            nav_agg_part_1.show()
        if is_write_file:
            nav_agg_part_1.write.option(
                'header',
                'true').mode('overwrite').csv(output_csv_path + str(date) +
                                              "/" + str(output_batch) + "/" +
                                              period)
    else:
        # User-defined functions
        udf_cagr = func.udf(lambda x: float(Measure.cal_cagr(pd.Series(x))),
                            FloatType())
        udf_cumret = func.udf(
            lambda x: float(Measure.cal_cumret(pd.Series(x))), FloatType())
        udf_aar = func.udf(lambda x: float(Measure.cal_aar(pd.Series(x))),
                           FloatType())
        udf_alpha = func.udf(
            lambda x, y, z, w, f: float(
                Measure.cal_alpha(
                    pd.Series(x),
                    pd.DataFrame({
                        'stock': y,
                        'treasury': z,
                        'credit': w
                    }), f)), FloatType())
        udf_standard_deviation = func.udf(
            lambda x: float(Measure.cal_standard_deviation(pd.Series(x))),
            FloatType())
        udf_downside_deviation = func.udf(
            lambda x: float(Measure.cal_downside_deviation(pd.Series(x))),
            FloatType())
        udf_max_drawdown = func.udf(
            lambda x, y: float(Measure.cal_max_drawdown(pd.Series(x, index=y))
                               ), FloatType())
        udf_marketbeta = func.udf(
            lambda x, y: float(
                Measure.cal_marketbeta(pd.Series(x), pd.Series(y))),
            FloatType())
        udf_var = func.udf(lambda x: float(Measure.cal_var(pd.Series(x))),
                           FloatType())
        udf_sharpe = func.udf(
            lambda x: float(Measure.cal_sharpe(pd.Series(x))), FloatType())
        udf_sortino = func.udf(
            lambda x: float(Measure.cal_sortino(pd.Series(x))), FloatType())
        udf_calmar = func.udf(
            lambda x: float(Measure.cal_calmar(pd.Series(x))), FloatType())
        udf_omega = func.udf(lambda x: float(Measure.cal_omega(pd.Series(x))),
                             FloatType())
        udf_information = func.udf(
            lambda x, y: float(
                Measure.cal_information(pd.Series(x), pd.Series(y))),
            FloatType())
        udf_treynor = func.udf(
            lambda x, y: float(Measure.cal_treynor(pd.Series(x), pd.Series(y))
                               ), FloatType())
        udf_m_square = func.udf(
            lambda x, y: float(Measure.cal_m_square(pd.Series(x), pd.Series(y))
                               ), FloatType())
        udf_sterling = func.udf(
            lambda x: float(Measure.cal_sterling(pd.Series(x))), FloatType())
        udf_burke = func.udf(lambda x: float(Measure.cal_burke(pd.Series(x))),
                             FloatType())
        udf_tail = func.udf(lambda x: float(Measure.cal_tail(pd.Series(x))),
                            FloatType())
        udf_rachev = func.udf(
            lambda x: float(Measure.cal_rachev(pd.Series(x))), FloatType())
        udf_stability = func.udf(
            lambda x: float(Measure.cal_stability(pd.Series(x))), FloatType())
        udf_min_monthly_return = func.udf(
            lambda x, y: float(
                Measure.cal_min_monthly_return(pd.Series(x, index=y))),
            FloatType())
        udf_max_monthly_return = func.udf(
            lambda x, y: float(
                Measure.cal_max_monthly_return(pd.Series(x, index=y))),
            FloatType())
        udf_monthly_odds = func.udf(
            lambda x, y: float(Measure.cal_monthly_odds(pd.Series(x, index=y))
                               ), FloatType())
        udf_picking = func.udf(
            lambda x, y: float(
                Measure.cal_picking(pd.Series(x), pd.Series(y, name='stock'))),
            FloatType())
        udf_timing = func.udf(
            lambda x, y: float(
                Measure.cal_timing(pd.Series(x), pd.Series(y, name='stock'))),
            FloatType())
        udf_trackerror = func.udf(
            lambda x, y, z: float(
                Measure.cal_trackerror(pd.Series(x), pd.Series(y), z)),
            FloatType())
        # Keep only funds whose first available date is on or before `start` (the fund existed before the window begins)
        ret_all_spark_df = ret_all_spark_df.withColumn(
            'the_first_date',
            func.first('date').over(w))
        ret_all_spark_df = ret_all_spark_df[ret_all_spark_df.the_first_date <=
                                            datetime.strptime(start, '%Y%m%d')]
        # Slice: keep only data on or after `start`
        ret_all_spark_df = ret_all_spark_df[
            ret_all_spark_df.date >= datetime.strptime(start, '%Y%m%d')]
        # Filter out funds with insufficient history; after re-slicing, the fund length must be recomputed
        ret_all_spark_df = ret_all_spark_df.withColumn(
            'fund_length',
            func.count('date').over(w))
        ret_all_spark_df = ret_all_spark_df[
            ret_all_spark_df['fund_length'] >= 2]
        # Order within the window so ret is collected in date order; otherwise collect_list does not preserve it
        ret_all_spark_df = ret_all_spark_df.withColumn('ret_list',func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('ret').over(w)))\
            .withColumn('stock_ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('stock').over(w)))\
            .withColumn('treasury_ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('treasury').over(w)))\
            .withColumn('credit_ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('credit').over(w))) \
            .withColumn('bench_ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('bench_ret').over(w))) \
            .withColumn('date_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(datetime.strptime('2020-03-06','%Y-%m-%d')))).otherwise(func.collect_list('date').over(w)))
        nav_agg_part_2 = ret_all_spark_df[
            ret_all_spark_df.date == ret_all_spark_df.the_last_date].select(
                'sec_id', 'ret_list', 'stock_ret_list', 'treasury_ret_list',
                'credit_ret_list', 'bench_ret_list', 'date_list',
                'fnd_category')
        if is_debug:
            nav_agg_part_2.show()
        # ret_all_spark_df is no longer needed below, so drop all of its columns
        ret_all_spark_df = ret_all_spark_df.drop(
            'sec_id', 'date', 'nav', 'ret', 'stock', 'treasury', 'credit',
            'bench_ret', 'fnd_category', 'the_last_date', 'the_first_date',
            'fund_length', 'ret_list', 'stock_ret_list', 'treasury_ret_list',
            'credit_ret_list', 'bench_ret_list', 'date_list')
        if is_debug:
            ret_all_spark_df.show()
        nav_agg_part_2 = nav_agg_part_2.withColumn('cagr_' + period, udf_cagr('ret_list'))\
            .withColumn('cumret_' + period, udf_cumret('ret_list'))\
            .withColumn('aar_' + period, udf_aar('ret_list'))\
            .withColumn('alpha_' + period, udf_alpha('ret_list','stock_ret_list','treasury_ret_list','credit_ret_list', 'fnd_category'))\
            .withColumn('vol_' + period, udf_standard_deviation('ret_list'))\
            .withColumn('dvol_' + period, udf_downside_deviation('ret_list'))\
            .withColumn('md_' + period, udf_max_drawdown('ret_list', 'date_list'))\
            .withColumn('beta_' + period, udf_marketbeta('ret_list', 'stock_ret_list'))\
            .withColumn('var_' + period, udf_var('ret_list'))\
            .withColumn('sharpe_' + period, udf_sharpe('ret_list'))\
            .withColumn('sortino_' + period, udf_sortino('ret_list'))\
            .withColumn('calmar_' + period, udf_calmar('ret_list'))\
            .withColumn('omega_' + period, udf_omega('ret_list'))\
            .withColumn('ir_' + period, udf_information('ret_list','stock_ret_list'))\
            .withColumn('treynor_' + period, udf_treynor('ret_list','stock_ret_list'))\
            .withColumn('m_square_' + period, udf_m_square('ret_list','stock_ret_list'))\
            .withColumn('sterling_' + period, udf_sterling('ret_list'))\
            .withColumn('burke_' + period, udf_burke('ret_list'))\
            .withColumn('tail_' + period, udf_tail('ret_list'))\
            .withColumn('rachev_' + period, udf_rachev('ret_list'))\
            .withColumn('stability_' + period, udf_stability('ret_list'))
        if period in ['3m', '6m', '1y', '3y', '5y']:
            nav_agg_part_2 = nav_agg_part_2.withColumn('min_monthly_ret_' + period, udf_min_monthly_return('ret_list','date_list'))\
                .withColumn('max_monthly_ret_' + period, udf_max_monthly_return('ret_list','date_list'))\
                .withColumn('monthly_odds_' + period, udf_monthly_odds('ret_list', 'date_list'))
        if period in ['1m', '3m', '6m', '1y', '3y', '5y']:
            nav_agg_part_2 = nav_agg_part_2.withColumn('picking_' + period, udf_picking('ret_list', 'stock_ret_list'))\
                .withColumn('timing_' + period, udf_timing('ret_list', 'stock_ret_list'))\
                .withColumn('te_' + period, udf_trackerror('ret_list', 'bench_ret_list', 'fnd_category'))
        # Drop the intermediate columns
        nav_agg_part_2 = nav_agg_part_2.drop('ret_list', 'stock_ret_list',
                                             'treasury_ret_list',
                                             'credit_ret_list',
                                             'bench_ret_list', 'date_list',
                                             'fnd_category')
        if is_debug:
            nav_agg_part_2.show()
        if is_write_file:
            nav_agg_part_2.write.option(
                'header',
                'true').mode('overwrite').csv(output_csv_path + str(date) +
                                              "/" + str(output_batch) + '/' +
                                              period)
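
# A hypothetical invocation sketch (not part of the original): the argument
# values are placeholders, and the module-level configuration this function
# relies on (data_source_csv_path, output_csv_path, is_debug, is_write_file,
# app_name, Measure, fetch_com_dt_hist, ...) is assumed to be defined elsewhere.
if __name__ == '__main__':
    cal_performance(date='20200320', period='1y', input_batch=1, output_batch=1)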