def spark_streaming_to_pubsublite(
    project_number: int, location: str, topic_id: str
) -> None:
    # [START pubsublite_spark_streaming_to_pubsublite]
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import array, create_map, col, lit, when
    from pyspark.sql.types import BinaryType, StringType
    import uuid

    # TODO(developer):
    # project_number = 11223344556677
    # location = "us-central1-a"
    # topic_id = "your-topic-id"

    spark = SparkSession.builder.appName("write-app").getOrCreate()

    # Create a RateStreamSource that generates consecutive numbers with timestamps:
    # |-- timestamp: timestamp (nullable = true)
    # |-- value: long (nullable = true)
    sdf = spark.readStream.format("rate").option("rowsPerSecond", 1).load()

    # Transform the dataframe to match the required data fields and data types:
    # https://github.com/googleapis/java-pubsublite-spark#data-schema
    sdf = (
        sdf.withColumn("key", lit("example").cast(BinaryType()))
        .withColumn("data", col("value").cast(StringType()).cast(BinaryType()))
        .withColumnRenamed("timestamp", "event_timestamp")
        # Populate the attributes field. For example, an even value will
        # have {"key1": [b"even"]}.
        .withColumn(
            "attributes",
            create_map(
                lit("key1"),
                array(when(col("value") % 2 == 0, b"even").otherwise(b"odd")),
            ),
        )
        .drop("value")
    )

    # After the transformation, the schema of the dataframe should look like:
    # |-- key: binary (nullable = false)
    # |-- data: binary (nullable = true)
    # |-- event_timestamp: timestamp (nullable = true)
    # |-- attributes: map (nullable = false)
    # |    |-- key: string
    # |    |-- value: array (valueContainsNull = false)
    # |    |    |-- element: binary (containsNull = false)
    sdf.printSchema()

    query = (
        sdf.writeStream.format("pubsublite")
        .option(
            "pubsublite.topic",
            f"projects/{project_number}/locations/{location}/topics/{topic_id}",
        )
        # Required. Use a unique checkpoint location for each job.
        .option("checkpointLocation", "/tmp/app" + uuid.uuid4().hex)
        .outputMode("append")
        .trigger(processingTime="1 second")
        .start()
    )

    # Wait 60 seconds to terminate the query.
    query.awaitTermination(60)
    query.stop()
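    # [END pubsublite_spark_streaming_to_pubsublite]


# A minimal, hypothetical invocation of the sample above; all argument values are
# placeholders and must point to an existing Pub/Sub Lite topic.
if __name__ == "__main__":
    spark_streaming_to_pubsublite(
        project_number=11223344556677,  # placeholder project number
        location="us-central1-a",       # Pub/Sub Lite zonal location
        topic_id="your-topic-id",       # placeholder topic ID
    )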
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

spark = SparkSession.builder.appName('Diamond_Mukesh').getOrCreate()
sc = spark.sparkContext

rdd1 = sc.textFile("wasb:///data/diamonds.csv")

# Find and remove the header from RDD
Header = rdd1.first()
rdd1 = rdd1.filter(lambda row: row != Header)
for i in rdd1.take(5):
    print(i)

schema = StructType([
    StructField('carat', StringType(), True),
    StructField('cut', StringType(), True),
    StructField('color', StringType(), True),
    StructField('clarity', StringType(), True),
    StructField('depth', StringType(), True),
    StructField('table', StringType(), True),
    StructField('price', StringType(), True),
    StructField('col_x', StringType(), True),
    StructField('col_y', StringType(), True),
    StructField('col_z', StringType(), True)
])
df1 = spark.createDataFrame(rdd1.map(lambda x: x.split(',')), schema)

print(df1.columns)
df1.printSchema()  # printSchema() prints directly and returns None
print(df1.count())
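
# A shorter alternative sketch (not part of the original snippet): the DataFrame
# reader can skip the header row and apply the same all-string schema directly,
# avoiding the manual RDD header filtering above.
df1_alt = spark.read.csv("wasb:///data/diamonds.csv", schema=schema, header=True)
df1_alt.printSchema()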
class HlyDelaysCalculator:
    """Class to handle calculations and updates to the HlyDelays table

  Aggregates data on delays by date and hour in the US/Eastern timezone
  """

    TableSchema = StructType([
        StructField("DateEST", DateType(), False),
        StructField("HourEST", IntegerType(), False),
        StructField("RouteId", StringType(), True),
        StructField("StopName", StringType(), True),
        StructField("AvgDelay", DoubleType(), False),
        StructField("AvgDist", DoubleType(), False),
        StructField("Cnt", IntegerType(), False),
        StructField("StopLat", DoubleType(), True),
        StructField("StopLon", DoubleType(), True),
        StructField("StopId", StringType(), True)
    ])

    DateHour = StructType([
        StructField("DateEST", DateType(), False),
        StructField("HourEST", IntegerType(), False)
    ])

    @staticmethod
    def datetime_to_datehour(dt):
        """Converts a UTC datetime to a date and hour in the US/Eastern time zone

    Args:
      dt: datetime to convert
    """

        dt = pytz.utc.localize(dt).astimezone(Settings.MBTA_TZ)
        return (dt.date(), dt.hour)

    def __init__(self, spark, dfVPDelays):
        """Initializes the instance

    Args:
      spark: Spark Session object
      dfVPDelays: dataframe containing unaggregated delays for trips and stops
    """

        self.spark = spark
        self.dfVPDelays = dfVPDelays

    def create_result_df(self):
        """Aggregates a delays dataframe so that it has data grouped by
    Date (US/Eastern time), Hour (US/Eastern time), RouteId, and StopName
    """

        udf_datetime_to_datehour = F.udf(
            HlyDelaysCalculator.datetime_to_datehour,
            HlyDelaysCalculator.DateHour)
        dfResult = self.dfVPDelays \
          .withColumn(
            "datehour",
            udf_datetime_to_datehour(self.dfVPDelays.SchedDT)
          )
        dfResult = dfResult.filter("EstDist < 100")
        dfResult = dfResult \
          .withColumn("DateEST", dfResult.datehour.DateEST) \
          .withColumn("HourEST", dfResult.datehour.HourEST) \
          .drop("datehour")
        dfResult = dfResult \
          .groupBy(
            dfResult.DateEST, dfResult.HourEST,
            dfResult.RouteId, dfResult.StopName
          ) \
          .agg(
            F.mean(dfResult.EstDelay).alias("AvgDelay"),
            F.mean(dfResult.EstDist).alias("AvgDist"),
            F.count(F.lit(1)).alias("Cnt"),
            F.first(dfResult.StopLat).alias("StopLat"),
            F.first(dfResult.StopLon).alias("StopLon"),
            F.first(dfResult.StopId).alias("StopId")
          )

        return dfResult

    def group_routes(self, dfHlyDelays):
        """Additionally aggregates an hourly delays dataframe so that it has data
    grouped by Date (US/Eastern time), Hour (US/Eastern time), and RouteId
    """

        dfResult = dfHlyDelays \
          .groupBy(
            dfHlyDelays.DateEST, dfHlyDelays.HourEST, dfHlyDelays.RouteId
          ) \
          .agg(
            (F.sum(dfHlyDelays.AvgDelay * dfHlyDelays.Cnt) /
              F.sum(dfHlyDelays.Cnt)).alias("AvgDelay"),
            (F.sum(dfHlyDelays.AvgDist * dfHlyDelays.Cnt) /
              F.sum(dfHlyDelays.Cnt)).alias("AvgDist"),
            F.sum(dfHlyDelays.Cnt).alias("Cnt")
          )
        return dfResult

    def group_stops(self, dfHlyDelays):
        """Additionally aggregates an hourly delays dataframe so that it has data
    grouped by Date (US/Eastern time), Hour (US/Eastern time), and StopName
    """

        dfResult = dfHlyDelays \
          .groupBy(
            dfHlyDelays.DateEST, dfHlyDelays.HourEST, dfHlyDelays.StopName
          ) \
          .agg(
            (F.sum(dfHlyDelays.AvgDelay * dfHlyDelays.Cnt) /
              F.sum(dfHlyDelays.Cnt)).alias("AvgDelay"),
            (F.sum(dfHlyDelays.AvgDist * dfHlyDelays.Cnt) /
              F.sum(dfHlyDelays.Cnt)).alias("AvgDist"),
            F.sum(dfHlyDelays.Cnt).alias("Cnt"),
            F.first(dfHlyDelays.StopLat).alias("StopLat"),
            F.first(dfHlyDelays.StopLon).alias("StopLon"),
            F.first(dfHlyDelays.StopId).alias("StopId")
          )
        return dfResult

    def group_all(self, dfHlyDelays):
        """Additionally aggregates an hourly delays dataframe so that it has data
    grouped by Date (US/Eastern time) and Hour (US/Eastern time) only
    """

        dfResult = dfHlyDelays \
          .groupBy(
            dfHlyDelays.DateEST, dfHlyDelays.HourEST
          ) \
          .agg(
            (F.sum(dfHlyDelays.AvgDelay * dfHlyDelays.Cnt) /
              F.sum(dfHlyDelays.Cnt)).alias("AvgDelay"),
            (F.sum(dfHlyDelays.AvgDist * dfHlyDelays.Cnt) /
              F.sum(dfHlyDelays.Cnt)).alias("AvgDist"),
            F.sum(dfHlyDelays.Cnt).alias("Cnt")
          )
        return dfResult

    def update_s3(self, dfHlyDelays, pqDate):
        """Saves a delays dataframe to the S3 in parquet
    """

        s3Mgr = s3.S3Mgr()
        mxdstr = '0' if Settings.MaxAbsDelay <= 0 else str(
            Settings.MaxAbsDelay)
        pfx = f"HlyDelays{mxdstr}/{pqDate.strftime('%Y%m%d.pq')}"
        if s3Mgr.prefix_exists(pfx):
            s3Mgr.delete_prefix(pfx)
            time.sleep(5)

        dfHlyDelays = dfHlyDelays \
          .withColumn(
            'route_stop',
            F.concat(
              dfHlyDelays.RouteId, F.lit(':::'),
              F.lit('['), dfHlyDelays.StopName, F.lit(']')
            )
          )
        dfHlyDelays = dfHlyDelays \
          .groupBy(dfHlyDelays.route_stop) \
          .agg(
            F.collect_list(
              F.struct(
                dfHlyDelays.DateEST, dfHlyDelays.HourEST,
                dfHlyDelays.AvgDelay, dfHlyDelays.AvgDist, dfHlyDelays.Cnt
              )
            ).alias('vals_unsorted')
          )

        udf_ret_type = ArrayType(
            StructType([
                StructField("DateEST", DateType(), False),
                StructField("HourEST", IntegerType(), False),
                StructField("AvgDelay", DoubleType(), False),
                StructField("AvgDist", DoubleType(), False),
                StructField("Cnt", IntegerType(), False)
            ]))
        udf_sort_vals = F.udf(
            lambda vals: list(
                sorted(vals, key=lambda r: (r.DateEST, r.HourEST))),
            udf_ret_type)
        dfHlyDelays = dfHlyDelays \
          .withColumn('vals', udf_sort_vals(dfHlyDelays.vals_unsorted)) \
          .drop('vals_unsorted')

        dfHlyDelays.printSchema()

        s3_path = "s3a://%s/%s" % (Settings.S3BucketName, pfx)
        dfHlyDelays.write.mode("overwrite").parquet(s3_path)
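
# Hypothetical usage sketch (not from the source): dfVPDelays and pqDate are
# placeholders for the unaggregated delays dataframe and the parquet date.
#   calc = HlyDelaysCalculator(spark, dfVPDelays)
#   dfHly = calc.create_result_df().cache()
#   dfHlyRoutes = calc.group_routes(dfHly)   # per-route rollup
#   calc.update_s3(dfHly, pqDate)            # persist to S3 as Parquet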
# Example #4
def my_schema() -> StructType:
    return StructType([
        StructField("name", StringType()),
        StructField("scheduling",
                    AutoMapperElasticSearchSchedule.my_schema()),
    ])
class ExtractHostLinksJob(ExtractLinksJob):
    '''Extract links from WAT files and redirects from WARC files,
     extract the host names, reverse the names (example.com -> com.example)
     and save the pairs <from_host, to_host>.'''

    name = "ExtrHostLinks"
    output_schema = StructType([
        StructField("s", StringType(), True),
        StructField("t", StringType(), True)
    ])
    num_input_partitions = 32
    num_output_partitions = 16

    # match global links
    # - with URL scheme, more restrictive than specified in
    #   https://tools.ietf.org/html/rfc3986#section-3.1
    # - or starting with //
    #   (all other "relative" links are within the same host)
    global_link_pattern = re.compile('^(?:[a-z][a-z0-9]{1,5}:)?//',
                                     re.IGNORECASE)

    # match IP addresses
    # - including IPs with leading `www.' (stripped)
    ip_pattern = re.compile(r'^(?:www\.)?\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\Z')

    # valid host names, relaxed allowing underscore, allowing also IDNs
    # https://en.wikipedia.org/wiki/Hostname#Restrictions_on_valid_hostnames
    host_part_pattern = re.compile(r'^[a-z0-9]([a-z0-9_-]{0,61}[a-z0-9])?\Z',
                                   re.IGNORECASE)

    @staticmethod
    def get_surt_host(url):
        try:
            host = urlparse(url).hostname
        except Exception:
            # self.get_logger().debug("Failed to parse URL {}: {}".format(url, e))
            return None
        if host is None:
            return None
        host = host.strip().lower()
        if len(host) < 1 or len(host) > 253:
            return None
        if ExtractHostLinksJob.ip_pattern.match(host):
            return None
        parts = host.split('.')
        if parts[-1] == '':
            # trailing dot is allowed, strip it
            parts = parts[0:-1]
        if len(parts) > 2 and parts[0] == 'www':
            # strip leading 'www' to reduce number of "duplicate" hosts,
            # but leave at least 2 trailing parts (www.com is a valid domain)
            parts = parts[1:]
        if len(parts) <= 1:
            # do not accept single-word hosts, must be at least `domain.tld'
            return None
        for i in range(0, len(parts)):
            part = parts[i]
            if not ExtractHostLinksJob.host_part_pattern.match(part):
                try:
                    idn = idna.encode(part).decode('ascii')
                except (idna.IDNAError, UnicodeError, IndexError, Exception):
                    # self.get_logger().debug("Invalid host name: {}".format(url))
                    return None

                if ExtractHostLinksJob.host_part_pattern.match(idn):
                    parts[i] = idn
                else:
                    # self.get_logger().debug("Invalid host name: {}".format(url))
                    return None
        parts.reverse()
        return '.'.join(parts)

    def yield_links(self, from_url, base_url, links, url_attr='url'):
        from_host = ExtractHostLinksJob.get_surt_host(from_url)
        if from_host is None:
            return
        target_hosts = set()
        inner_host_links = 0
        for l in links:
            if l is None:
                continue
            if url_attr in l:
                link = l[url_attr]
                if self.global_link_pattern.match(link):
                    try:
                        thost = ExtractHostLinksJob.get_surt_host(link)
                        if thost is None:
                            pass  # no host, e.g., http:///abc/, file:///C:...
                        else:
                            target_hosts.add(thost)
                    except ValueError:
                        pass
                else:
                    inner_host_links += 1
        for t in target_hosts:
            if t != from_host:
                yield from_host, t
        if inner_host_links > 0 and base_url is not None:
            base_host = ExtractHostLinksJob.get_surt_host(base_url)
            if base_host is not None and base_host != from_host:
                # any internal link becomes an external link
                yield from_host, base_host

    def yield_redirect(self, src, target, http_status_line):
        if src == target:
            return
        src_host = ExtractHostLinksJob.get_surt_host(src)
        thost = ExtractHostLinksJob.get_surt_host(target)
        if thost is None or src_host is None or src_host == thost:
            return
        yield src_host, thost
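
# Illustrative checks (not part of the original job) of get_surt_host's behaviour:
assert ExtractHostLinksJob.get_surt_host("https://www.example.com/page") == "com.example"
assert ExtractHostLinksJob.get_surt_host("http://192.168.0.1/") is None  # IP addresses are rejected
assert ExtractHostLinksJob.get_surt_host("http://localhost/") is None    # single-word hosts are rejected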
# Example #6
def create_unified_log(spark):

    data = [
        (
            '0000001',
            0,
            '2020-01-01 12:34:56.78',
            'travel',
            '1',
            'splash',
            'WIFI',
            0,
            0,
            '1000',
            1577836800,
            1577910896,
            '2020-01-01',
            '1',
        ),
        (
            '0000001',
            1,
            '2020-01-01 12:34:56.78',
            'travel',
            '1',
            'splash',
            'WIFI',
            0,
            0,
            '1000',
            1577836800,
            1577910896,
            '2020-01-01',
            '1',
        ),
        (
            '0000001',
            1,
            '2020-01-01 12:34:56.78',
            'travel',
            '1',
            'splash',
            'WIFI',
            0,
            0,
            '1000',
            1577836800,
            1577910896,
            '2020-01-01',
            '1',
        ),
        (
            '0000001',
            1,
            '2020-01-01 12:34:56.78',
            'travel',
            '1',
            'splash',
            'WIFI',
            0,
            0,
            '1000',
            1577836800,
            1577910896,
            '2020-01-01',
            '1',
        ),
        (
            '0000001',
            0,
            '2020-01-01 12:34:56.78',
            'game-avg',
            '2',
            'splash',
            'WIFI',
            0,
            0,
            '1000',
            1577836800,
            1577910896,
            '2020-01-01',
            '1',
        ),
        (
            '0000001',
            0,
            '2020-01-01 12:34:56.78',
            'game-avg',
            '2',
            'splash',
            'WIFI',
            0,
            0,
            '1000',
            1577836800,
            1577910896,
            '2020-01-01',
            '1',
        ),
        (
            '0000001',
            1,
            '2020-01-01 12:34:56.78',
            'game-avg',
            '2',
            'splash',
            'WIFI',
            0,
            0,
            '1000',
            1577836800,
            1577910896,
            '2020-01-01',
            '1',
        ),
        (
            '0000001',
            1,
            '2020-01-01 12:34:56.78',
            'game-avg',
            '2',
            'splash',
            'WIFI',
            0,
            0,
            '1000',
            1577836800,
            1577910896,
            '2020-01-01',
            '1',
        ),
        (
            '0000001',
            0,
            '2020-01-01 12:34:56.78',
            'reading',
            '3',
            'splash',
            'WIFI',
            0,
            0,
            '1000',
            1577836800,
            1577910896,
            '2020-01-01',
            '1',
        ),
        (
            '0000001',
            0,
            '2020-01-01 12:34:56.78',
            'reading',
            '3',
            'splash',
            'WIFI',
            0,
            0,
            '1000',
            1577836800,
            1577910896,
            '2020-01-01',
            '1',
        ),
        (
            '0000001',
            0,
            '2020-01-01 12:34:56.78',
            'reading',
            '3',
            'splash',
            'WIFI',
            0,
            0,
            '1000',
            1577836800,
            1577910896,
            '2020-01-01',
            '1',
        ),
        (
            '0000001',
            1,
            '2020-01-01 12:34:56.78',
            'reading',
            '3',
            'splash',
            'WIFI',
            0,
            0,
            '1000',
            1577836800,
            1577910896,
            '2020-01-01',
            '1',
        ),
        (
            '0000001',
            0,
            '2020-01-02 12:34:56.78',
            'travel',
            '1',
            'splash',
            'WIFI',
            0,
            0,
            '1000',
            1577923200,
            1577997296,
            '2020-01-02',
            '1',
        ),
        (
            '0000001',
            1,
            '2020-01-02 12:34:56.78',
            'travel',
            '1',
            'splash',
            'WIFI',
            0,
            0,
            '1000',
            1577923200,
            1577997296,
            '2020-01-02',
            '1',
        ),
        (
            '0000001',
            0,
            '2020-01-03 12:34:56.78',
            'travel',
            '1',
            'native',
            '4G',
            0,
            0,
            '1001',
            1578009600,
            1578083696,
            '2020-01-03',
            '1',
        ),
        (
            '0000001',
            1,
            '2020-01-03 12:34:56.78',
            'travel',
            '1',
            'native',
            '4G',
            0,
            0,
            '1001',
            1578009600,
            1578083696,
            '2020-01-03',
            '1',
        ),
        (
            '0000002',
            0,
            '2020-01-02 12:34:56.78',
            'travel',
            '1',
            'splash',
            'WIFI',
            1,
            0,
            '1000',
            1577923200,
            1577997296,
            '2020-01-02',
            '1',
        ),
        (
            '0000002',
            0,
            '2020-01-02 12:34:56.78',
            'travel',
            '1',
            'splash',
            'WIFI',
            1,
            0,
            '1000',
            1577923200,
            1577997296,
            '2020-01-02',
            '1',
        ),
        (
            '0000002',
            0,
            '2020-01-02 12:34:56.78',
            'travel',
            '1',
            'splash',
            'WIFI',
            1,
            0,
            '1000',
            1577923200,
            1577997296,
            '2020-01-02',
            '1',
        ),
        (
            '0000003',
            1,
            '2020-01-03 12:34:56.78',
            'travel',
            '1',
            'native',
            '4G',
            0,
            1,
            '1001',
            1578009600,
            1578083696,
            '2020-01-03',
            '1',
        ),
        (
            '0000003',
            1,
            '2020-01-03 12:34:56.78',
            'travel',
            '1',
            'native',
            '4G',
            0,
            1,
            '1001',
            1578009600,
            1578083696,
            '2020-01-03',
            '1',
        ),
        (
            '0000003',
            1,
            '2020-01-03 12:34:56.78',
            'travel',
            '1',
            'native',
            '4G',
            0,
            1,
            '1001',
            1578009600,
            1578083696,
            '2020-01-03',
            '1',
        ),
    ]

    schema = StructType([
        StructField('did', StringType(), True),
        StructField('is_click', IntegerType(), True),
        StructField('action_time', StringType(), True),
        StructField('keyword', StringType(), True),
        StructField('keyword_index', StringType(), True),
        StructField('media', StringType(), True),
        StructField('net_type', StringType(), True),
        StructField('gender', IntegerType(), True),
        StructField('age', IntegerType(), True),
        StructField('adv_id', StringType(), True),
        StructField('interval_starting_time', IntegerType(), True),
        StructField('action_time_seconds', IntegerType(), True),
        StructField('day', StringType(), True),
        StructField('did_bucket', StringType(), True),
    ])

    return spark.createDataFrame(spark.sparkContext.parallelize(data), schema)
# Example #7
def create_trainready_filter_user_data(spark):
    data = [
        (
            0,
            0,
            'normal',
            1,
            [
                u'1578009600', u'1577923200', u'1577836800', u'1577750400',
                u'157766400', u'1577577600', u'1577491200', u'1577404800',
                u'1577318400', u'1577232000'
            ],
            [
                u'travel', u'travel', u'travel', u'travel', u'travel',
                u'travel', u'game-avg', u'travel', u'travel',
                u'travel,game-avg'
            ],
            [u'1', u'1', u'1', u'1', u'1', u'1', u'1', u'1', u'1', u'1,2'],
            [
                u'1:1', u'1:1', u'1:1', u'1:1', u'1:2', u'1:2', u'2:1', u'1:2',
                u'1:2', u'1:2,2:1'
            ],  # 16
            [
                u'1:0', u'1:0', u'1:0', u'1:0', u'1:1', u'1:1', u'2:0', u'1:1',
                u'1:1', u'1:1,2:0'
            ],
            '1',
        ),
        (
            0,
            0,
            'low average show count/few active intervals',
            2,
            [
                u'1578009600', u'1577923200', u'1577836800', u'1577750400',
                u'157766400', u'1577577600', u'1577491200', u'1577404800',
                u'1577318400', u'1577232000'
            ],
            [
                u'travel', u'travel', u'travel', u'travel', u'travel',
                u'travel', u'game-avg', u'travel', u'travel',
                u'travel,game-avg'
            ],
            [u'1', u'1', u'1', u'1', u'1', u'1', u'1', u'1', u'1', u'1,2'],
            [
                u'1:0', u'1:0', u'1:0', u'1:0', u'1:0', u'1:0', u'2:0', u'1:0',
                u'1:0', u'1:0,2:1'
            ],  # 1
            [
                u'1:0', u'1:0', u'1:0', u'1:0', u'1:1', u'1:1', u'2:0', u'1:1',
                u'1:1', u'1:1,2:0'
            ],
            '1',
        ),
        (
            0,
            0,
            'high average show count',
            3,
            [
                u'1578009600', u'1577923200', u'1577836800', u'1577750400',
                u'157766400', u'1577577600', u'1577491200', u'1577404800',
                u'1577318400', u'1577232000'
            ],
            [
                u'travel', u'travel', u'travel', u'travel', u'travel',
                u'travel', u'game-avg', u'travel', u'travel',
                u'travel,game-avg'
            ],
            [u'1', u'1', u'1', u'1', u'1', u'1', u'1', u'1', u'1', u'1,2'],
            [
                u'1:5000', u'1:0', u'1:0', u'1:0', u'1:0', u'1:0', u'2:0',
                u'1:0', u'1:0', u'1:1,2:1'
            ],  # 5002
            [
                u'1:0', u'1:0', u'1:0', u'1:0', u'1:1', u'1:1', u'2:0', u'1:1',
                u'1:1', u'1:1,2:0'
            ],
            '1',
        ),
        (
            0,
            0,
            'sparse impressions',
            4,
            [u'1577232000'],
            [u'travel,game-avg'],
            [u'1,2'],
            [u'1:10,2:10'],  # 20
            [u'1:1,2:0'],
            '1',
        ),
    ]

    schema = StructType([
        StructField('age', IntegerType(), True),
        StructField('gender', IntegerType(), True),
        StructField('did', StringType(), True),
        StructField('did_index', LongType(), True),
        StructField('interval_starting_time', ArrayType(StringType(), True),
                    True),
        StructField('interval_keywords', ArrayType(StringType(), True), True),
        StructField('kwi', ArrayType(StringType(), True), True),
        StructField('kwi_show_counts', ArrayType(StringType(), True), True),
        StructField('kwi_click_counts', ArrayType(StringType(), True), True),
        StructField('did_bucket', StringType(), True),
    ])

    return spark.createDataFrame(spark.sparkContext.parallelize(data), schema)
# Median of col2 per group: windowed variant (magic_percentile and grp_window are
# defined in the sketch below)
df.withColumn('med_col2', magic_percentile.over(grp_window))
# method 1: groupBy + agg
df.groupBy('col1').agg(magic_percentile.alias('med_col2'))
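
# A minimal sketch (an assumption, not from the source) of how magic_percentile
# and grp_window could be defined for the two calls above:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

magic_percentile = F.expr('percentile_approx(col2, 0.5)')
grp_window = Window.partitionBy('col1')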



# Extract values from a Row nested inside a StructField
from pyspark.sql import Row

df = sc.parallelize([Row(col1=Row(a=1, b="b"))]).toDF()
df.select(df.col1.getField("b")).show()
df.select(df.col1.a).show()

# data type
df.select(df.age.cast("string").alias('ages')).collect()
df.select(df.age.cast(StringType()).alias('ages')).collect()

for i in numeric_features:
    new_df = new_df.withColumn(i+'_buckets', F.col(i+'_buckets').cast(IntegerType()).cast(StringType()))


# If a column holds a list or dict, fetch an element by index or key
df = sc.parallelize([([1, 2], {"key": "value"})]).toDF(["l", "d"])
df.select(df.l.getItem(0).alias('first of l'), df.d.getItem("key").alias('value of d')).show()
df.select(df.l[0], df.d["key"]).show()

# order
ds.select("A", "B").orderBy("C", ascending=False).show()
ds.select("A", "B").orderBy(ds.C.desc()).show()
# multi fields
ds.select("A").orderBy(["B", "C"], ascending=[0, 1])
# Example #9
# train the model
model = ALS.train(
    dfRates.rdd, 20,
    20)  # you could tune these numbers, but these are reasonable choices
print("trained ...")

# use this model to predict what the user would rate accommodations that she has not rated
allPredictions = None
for USER_ID in range(0, 100):
    dfUserRatings = dfRates.filter(
        dfRates.userId == USER_ID).rdd.map(lambda r: r.accoId).collect()
    rddPotential = dfAccos.rdd.filter(lambda x: x[0] not in dfUserRatings)
    pairsPotential = rddPotential.map(lambda x: (USER_ID, x[0]))
    predictions = model.predictAll(pairsPotential).map(
        lambda p: (str(p[0]), str(p[1]), float(p[2])))
    predictions = predictions.takeOrdered(5, key=lambda x: -x[2])  # top 5
    print("predicted for user={0}".format(USER_ID))
    if allPredictions is None:
        allPredictions = predictions
    else:
        allPredictions.extend(predictions)

# write the recommendations
schema = StructType([
    StructField("userId", StringType(), True),
    StructField("accoId", StringType(), True),
    StructField("prediction", FloatType(), True)
])
dfToSave = sqlContext.createDataFrame(allPredictions, schema)
dfToSave.write.jdbc(url=jdbcUrl, table='Recommendation', mode='overwrite')
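
# Side note (not part of the original sample): MatrixFactorizationModel also offers
# model.recommendProducts(USER_ID, 5) for top-5 recommendations per user, though,
# unlike the loop above, it does not exclude accommodations the user already rated.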
    # Initialize the Spark session.
    spark = SparkSession \
        .builder \
        .appName("ParseArxivDataset") \
        .getOrCreate()

    # Load the input files; each line should be a JSON record.

    explode_path = "{}/*json".format(sys.argv[1])

    explode_data = spark.read.json(explode_path)

    schema = StructType([
        StructField("id", StringType()),
        StructField("name", StringType()),
        StructField("author_score", DoubleType())
    ])

    author_score = spark.read.csv(sys.argv[2], schema=schema)

    author_score.printSchema()

    inner_join_res = explode_data.join(
        author_score, explode_data.id == author_score.id).drop(author_score.id)

    inner_join_res.printSchema()

    print("count : {}".format(inner_join_res.count()))
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, to_json, col, unbase64, base64, split, expr
from pyspark.sql.types import StructField, StructType, StringType, BooleanType, ArrayType, DateType

SPARK_HOST = "spark://spark:7077"
KAFKA_HOST = "kafka:19092"

redisMessageSchema = StructType([
    StructField("key", StringType()),
    StructField("value", StringType()),
    StructField("expiredType", StringType()),
    StructField("expiredValue", StringType()),
    StructField("existType", StringType()),
    StructField("ch", StringType()),
    StructField("incr", BooleanType()),
    StructField(
        "zSetEntries",
        ArrayType(
            StructType([
                StructField("element", StringType()),
                StructField("score", StringType())
            ])))
])

customerJSONSchema = StructType([
    StructField("customerName", StringType()),
    StructField("email", StringType()),
    StructField("phone", StringType()),
    StructField("birthDay", StringType()),
    StructField("accountNumber", StringType()),
    StructField("location", StringType())
class OntologyTermHierarchyExtractor(PySparkTask):
    """
    PySpark Task class to extract the hierarchical relations between terms for the ontologies: MPATH, MA, EMAPA and MP.
    The main goal of this task is to assign to any ontology term a list of:

    - direct children
    - direct parents
    - top level terms
    - intermediate terms (i.e. terms between the given term and the top level ones)
    - synonyms and definitions for all the related terms
    """

    #: Name of the Spark task
    name = "IMPC_Ontology_Term_Hierarchy_Extractor"

    #: Path to the directory containing OBO files for MA, MPATH, EMAPA and MP.
    obo_ontology_input_path: luigi.Parameter = luigi.Parameter()

    #: Path of the output directory where the new parquet file will be generated.
    output_path: luigi.Parameter = luigi.Parameter()

    #: List of ontologies to process with their corresponding top level terms
    ONTOLOGIES: List[Dict] = [
        {
            "id": "mpath",
            "format": "obo",
            "top_level_terms": []
        },
        {
            "id":
            "mp",
            "format":
            "obo",
            "top_level_terms": [
                "MP:0010768",
                "MP:0002873",
                "MP:0001186",
                "MP:0003631",
                "MP:0005367",
                "MP:0005369",
                "MP:0005370",
                "MP:0005371",
                "MP:0005377",
                "MP:0005378",
                "MP:0005375",
                "MP:0005376",
                "MP:0005379",
                "MP:0005380",
                "MP:0005381",
                "MP:0005384",
                "MP:0005385",
                "MP:0005382",
                "MP:0005388",
                "MP:0005389",
                "MP:0005386",
                "MP:0005387",
                "MP:0005391",
                "MP:0005390",
                "MP:0005394",
                "MP:0005397",
                "MP:0010771",
            ],
        },
        {
            "id":
            "ma",
            "format":
            "obo",
            "top_level_terms": [
                "MA:0000004",
                "MA:0000007",
                "MA:0000009",
                "MA:0000010",
                "MA:0000012",
                "MA:0000014",
                "MA:0000016",
                "MA:0000017",
                "MA:0000325",
                "MA:0000326",
                "MA:0000327",
                "MA:0002411",
                "MA:0002418",
                "MA:0002431",
                "MA:0002711",
                "MA:0002887",
                "MA:0002405",
            ],
        },
        {
            "id":
            "emapa",
            "format":
            "obo",
            "top_level_terms": [
                "EMAPA:16104",
                "EMAPA:16192",
                "EMAPA:16246",
                "EMAPA:16405",
                "EMAPA:16469",
                "EMAPA:16727",
                "EMAPA:16748",
                "EMAPA:16840",
                "EMAPA:17524",
                "EMAPA:31858",
            ],
        },
        # {"id": "efo", "top_level_terms": []},
        # {"id": "emap", "top_level_terms": []},
        # {"id": "pato", "top_level_terms": []},
    ]

    #: Schema of the resulting parquet file
    ONTOLOGY_SCHEMA: StructType = StructType([
        StructField("id", StringType(), True),
        StructField("term", StringType(), True),
        StructField("definition", StringType(), True),
        StructField("synonyms", ArrayType(StringType()), True),
        StructField("alt_ids", ArrayType(StringType()), True),
        StructField("child_ids", ArrayType(StringType()), True),
        StructField("child_terms", ArrayType(StringType()), True),
        StructField("child_definitions", ArrayType(StringType()), True),
        StructField("child_term_synonyms", ArrayType(StringType()), True),
        StructField("parent_ids", ArrayType(StringType()), True),
        StructField("parent_terms", ArrayType(StringType()), True),
        StructField("parent_definitions", ArrayType(StringType()), True),
        StructField("parent_term_synonyms", ArrayType(StringType()), True),
        StructField("intermediate_ids", ArrayType(StringType()), True),
        StructField("intermediate_terms", ArrayType(StringType()), True),
        StructField("intermediate_definitions", ArrayType(StringType()), True),
        StructField("intermediate_term_synonyms", ArrayType(StringType()),
                    True),
        StructField("top_level_ids", ArrayType(StringType()), True),
        StructField("top_level_terms", ArrayType(StringType()), True),
        StructField("top_level_definitions", ArrayType(StringType()), True),
        StructField("top_level_synonyms", ArrayType(StringType()), True),
        StructField("top_level_term_id", ArrayType(StringType()), True),
    ])

    def output(self):
        """
        Returns the full parquet path as an output for the Luigi Task
        (e.g. impc/dr15.2/parquet/impc_ontology_term_hierarchy_parquet)
        """
        return ImpcConfig().get_target(
            f"{self.output_path}impc_ontology_term_hierarchy_parquet")

    def app_options(self):
        """
        Generates the options passed to the PySpark job
        """
        return [
            self.obo_ontology_input_path,
            ImpcConfig().deploy_mode,
            self.output().path,
        ]

    def main(self, sc: SparkContext, *args: Any):
        """
        DCC Extractor job runner
        """
        input_path = args[0]
        deploy_mode = args[1]
        output_path = args[2]

        spark = SparkSession(sc)
        ontology_df = self.extract_ontology_terms(spark, input_path,
                                                  deploy_mode)
        ontology_df.write.mode("overwrite").parquet(output_path)

    def extract_ontology_terms(self, spark_session: SparkSession,
                               ontologies_path: str,
                               deploy_mode: str) -> DataFrame:
        """
        Takes in a spark session and the path containing cached OBO files and returns
        a DataFrame that represents Ontology terms hierarchical relationships.
        """

        # List of ontology terms
        ontology_terms = []

        # This process can only be performed on local or client mode
        for ontology_desc in self.ONTOLOGIES:
            print(
                f"Processing {ontology_desc['id']}.{ontology_desc['format']}")

            # Get the OBO file from the directory if MPATH otherwise get it from OBO foundry
            if ontology_desc["id"] == "mpath":
                if deploy_mode in ["local", "client"]:
                    ontology: Ontology = Ontology(ontologies_path +
                                                  "mpath.obo")
                else:
                    full_ontology_str = spark_session.sparkContext.wholeTextFiles(
                        ontologies_path + "mpath.obo").collect()[0][1]
                    ontology: Ontology = Ontology(
                        BytesIO(bytes(full_ontology_str, encoding="utf-8")))
            else:
                ontology: Ontology = pronto.Ontology.from_obo_library(
                    f"{ontology_desc['id']}.{ontology_desc['format']}")

            part_of_rel: Relationship = None

            # Find the part_of relationship on the current loaded ontology
            for rel in ontology.relationships():
                if rel.id == "part_of":
                    part_of_rel = rel
                    break

            # If a part_of relationship is found, compute the hierarchy of terms using it
            if part_of_rel is not None:
                part_of_rel.transitive = False
                print("Starting to compute super classes from part_of")
                for term in ontology.terms():
                    for super_part_term in term.objects(part_of_rel):
                        if super_part_term.id in ontology.keys():
                            term.superclasses().add(super_part_term)
                print("Finished to compute super classes from part_of")

            # Get the set of ancestors for the top level terms
            top_level_terms = [
                ontology[term] for term in ontology_desc["top_level_terms"]
            ]
            top_level_ancestors = []
            for top_level_term in top_level_terms:
                top_level_ancestors.extend(
                    top_level_term.superclasses(with_self=False))
            top_level_ancestors = set(top_level_ancestors)

            # Iterate over the ontology terms to get the hierarchy between them and the top level terms
            ontology_terms += [
                self._parse_ontology_term(term, top_level_terms,
                                          top_level_ancestors, part_of_rel)
                for term in ontology.terms() if term.name is not None
            ]
            print(
                f"Finished processing {ontology_desc['id']}.{ontology_desc['format']}"
            )

        # Transform the list of dictionaries representing terms to JSON
        ontology_terms_json = spark_session.sparkContext.parallelize(
            ontology_terms)

        # Read the JSON RDD to a Spark DataFrame so it can be written to disk as Parquet
        ontology_terms_df = spark_session.read.json(
            ontology_terms_json, schema=self.ONTOLOGY_SCHEMA, mode="FAILFAST")
        return ontology_terms_df

    def _parse_ontology_term(
        self,
        ontology_term: Term,
        top_level_terms,
        top_level_ancestors,
        part_of_rel: Relationship,
    ) -> Dict:
        """
        Takes in an ontology term, a list of top level terms, a list of top level ancestors
        (i.e. the ancestors of the top level terms),
        the relationship used to convey hierarchy and returns a list of dictionaries with all the hierarchical
        relationships between the terms and the top level terms.
        """

        # Gather the direct children of the term
        children = [
            child_term
            for child_term in ontology_term.subclasses(1, with_self=False)
        ]

        # Get the direct parents of the term
        parents = [
            parent_term
            for parent_term in ontology_term.superclasses(1, with_self=False)
        ]

        # Get all the ancestors of the term
        ancestors = [
            ancestor_term
            for ancestor_term in ontology_term.superclasses(with_self=False)
        ]

        # Get all the ancestors based on the part_of relationship instead of relying on the is_a relationship
        if part_of_rel is not None:
            ancestors.extend([
                ancestor_term
                for ancestor_term in ontology_term.objects(part_of_rel)
            ])

        # Get the term top level terms by intersecting its ancestors with the ontology top level terms
        term_top_level_terms = set(top_level_terms).intersection(
            set(ancestors))

        # Determine the intermediate terms
        intermediate_terms = (set(ancestors).difference(
            set(top_level_terms)).difference(top_level_ancestors))
        # Builds and returns the term dictionary containing all the related terms information
        return {
            "id":
            ontology_term.id,
            "term":
            self._parse_text(ontology_term.name),
            "definition":
            self._parse_text(ontology_term.definition)
            if ontology_term.definition is not None else "null",
            "synonyms": [
                self._parse_text(synonym.description)
                for synonym in ontology_term.synonyms
                if synonym.type == "EXACT"
            ],
            "alt_ids":
            list(ontology_term.alternate_ids),
            "child_ids": [child_term.id for child_term in children],
            "child_terms":
            [self._parse_text(child_term.name) for child_term in children],
            "child_definitions": [
                self._parse_text(child_term.definition)
                for child_term in children if child_term.definition is not None
            ],
            "child_term_synonyms":
            self._get_synonym_list(children),
            "parent_ids": [parent_term.id for parent_term in parents],
            "parent_terms":
            [self._parse_text(parent_term.name) for parent_term in parents],
            "parent_definitions": [
                self._parse_text(parent_term.definition)
                for parent_term in parents
                if parent_term.definition is not None
            ],
            "parent_term_synonyms":
            self._get_synonym_list(parents),
            "intermediate_ids":
            [intermediate_term.id for intermediate_term in intermediate_terms],
            "intermediate_terms": [
                self._parse_text(intermediate_term.name)
                for intermediate_term in intermediate_terms
            ],
            "intermediate_definitions": [
                self._parse_text(intermediate_term.definition)
                for intermediate_term in intermediate_terms
                if intermediate_term.definition is not None
            ],
            "intermediate_term_synonyms":
            self._get_synonym_list(intermediate_terms),
            "top_level_ids": [
                term_top_level_term.id
                for term_top_level_term in term_top_level_terms
            ],
            "top_level_terms": [
                self._parse_text(term_top_level_term.name)
                for term_top_level_term in term_top_level_terms
            ],
            "top_level_definitions": [
                self._parse_text(term_top_level_term.definition)
                for term_top_level_term in term_top_level_terms
                if term_top_level_term.definition is not None
            ],
            "top_level_synonyms":
            self._get_synonym_list(term_top_level_terms),
            "top_level_term_id": [
                f"{term_top_level_term.id}___{self._parse_text(term_top_level_term.name)}"
                for term_top_level_term in term_top_level_terms
            ],
        }

    def _get_synonym_list(self, terms: Iterable[Term]):
        """
        Takes in a list of Terms and returns the list of synonyms for the given terms.
        """
        flat_list = []
        for term in terms:
            for synonym in term.synonyms:
                flat_list.append(self._parse_text(synonym.description))
        return flat_list

    def _parse_text(self, definition: bytes):
        """
        Parse an OBO definition text and return a valid Python str.
        """
        if definition is None:
            return None
        return unicodedata.normalize(
            "NFKD",
            definition.encode("iso-8859-1").decode("utf-8"))
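
# Hypothetical local run of the task above (paths are placeholders, not from the source):
#   import luigi
#   luigi.build(
#       [OntologyTermHierarchyExtractor(
#           obo_ontology_input_path="ontologies/",
#           output_path="out/parquet/")],
#       local_scheduler=True,
#   )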
# Example #13
def StreamingMr(rdd):

    if rdd.isEmpty():
        print("========================rdd is Empty========================")

    else:

        spark = SparkSession.builder.appName("spark_session").getOrCreate()

        # df_temp
        mylog_df = rdd.map(lambda line: line.split("|")).map(lambda line: [
            datetime.datetime.strptime(line[0].replace(
                '"', ''), "%Y-%m-%d %H:%M:%S"), line[1], line[2], line[3]
        ])
        fields = [
            StructField("access_time", TimestampType(), True),
            StructField("user_id", StringType(), True),
            StructField("session_id", StringType(), True),
            StructField("pre_post_politician_name", StringType(), True)
        ]
        schema = StructType(fields)
        df = spark.createDataFrame(mylog_df, schema)

        rdd = df.rdd

        mr = rdd.map(lambda line: (line[3], 1)).reduceByKey(lambda a, b: a + b)
        df_temp = spark.createDataFrame(mr)
        print("df_temp : ============================", type(df_temp))
        print("temp_show : ==========================", df_temp.show())

        df_temp = df_temp.withColumnRenamed("_1", "name")
        df_temp = df_temp.withColumnRenamed("_2", "temp_count")
        print("temp_show : ==========================", df_temp.show())

        # df_raw
        df_raw = spark.read.csv(hdfs_path + "/log_csv_res/*.csv", header=False)
        print("df_raw : ============================", type(df_raw))

        df_raw = df_raw.withColumnRenamed("_c0", "name")
        df_raw = df_raw.withColumnRenamed("_c1", "raw_count")

        # merge
        df_mig = df_raw.join(df_temp, df_raw.name == df_temp.name,
                             "left_outer").drop(df_temp.name)
        df_mig = df_mig.fillna(0)
        print("df_mig_join : =======================", df_mig.show())
        df_mig = df_mig.withColumn("count",
                                   df_mig.raw_count + df_mig.temp_count)
        print("df_mig : ============================", df_mig.show())
        df_mig = df_mig.select("name", "count")
        print("df_mig : ============================",
              df_mig.select("*").orderBy("count", ascending=False).show(5))

        print(
            "유승민 _df_mig : ============================",
            df_mig.select("*").where('name like "유승민(劉承旼)/%"').orderBy(
                "count", ascending=False).show(5))

        df_mig.coalesce(1).write.csv(path=hdfs_path + "/log_csv_res_temp",
                                     mode="overwrite")

        subprocess_open("$HADOOP_HOME/bin/hadoop fs -rm -r " + hdfs_path +
                        "/log_csv_res/*")
        subprocess_open("$HADOOP_HOME/bin/hadoop fs -mv " + hdfs_path +
                        "/log_csv_res_temp/* " + hdfs_path + "/log_csv_res/")
# Example #14
# (Databricks notebook fragment: flatter, flatten, JsonFilename, nu, dfED, dfTE,
#  sqlContext, sc and display are assumed to be defined in earlier notebook cells.)
JsonDF = (spark.read.option("inferSchema", "true").json(
    JsonFilename, multiLine=True).withColumn('ImportDateTime', lit(nu)))
dfE = flatter(JsonDF.schema)
dfE = dfE.withColumn('import', lit('E'))
display(dfE)

# COMMAND ----------

dfT = dfE.union(dfED).union(dfTE)
display(dfT)

# COMMAND ----------

# generate a list of all the columns
ColumnList = flatten(JsonDF.schema)

print(ColumnList)

cschema = StructType([StructField("org", StringType())])
rdd = sc.parallelize(ColumnList)
df = sqlContext.createDataFrame(rdd, cschema)

display(df)

# COMMAND ----------

print(ColumnList)

# COMMAND ----------

rdd.foreach(print)
# Example #15
def create_effective_keywords(spark):
    data = [('travel', ), ('game-avg', ), ('education', )]
    schema = StructType([StructField("keyword", StringType(), True)])
    return spark.createDataFrame(spark.sparkContext.parallelize(data), schema)
# Example #16
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

spark = SparkSession.builder.appName("Basic Read and Print").getOrCreate()

csv_schema = StructType(
    [StructField('id', IntegerType()),
     StructField('name', StringType())])

dataframe = spark.read.csv("read.csv", schema=csv_schema, header=True)

dataframe.show()
# Example #17
def create_cleaned_log(spark):
    data = [
        (
            'C000',
            '0000001',
            '1000',
            'splash',
            'abcdef0',
            'DUB-AL00',
            'WIFI',
            'CPC',
            '2020-01-01 12:34:56.78',
            'Huawei Magazine',
            0,
            0,
            'travel',
            '1',
            '2020-01-01',
            '1',
        ),
        (
            'C001',
            '0000002',
            '1000',
            'splash',
            'abcdef1',
            'DUB-AL00',
            'WIFI',
            'CPC',
            '2020-01-02 12:34:56.78',
            'Huawei Browser',
            1,
            0,
            'travel',
            '1',
            '2020-01-02',
            '1',
        ),
        (
            'C002',
            '0000003',
            '1001',
            'native',
            'abcdef2',
            'ABC-AL00',
            '4G',
            'CPD',
            '2020-01-03 12:34:56.78',
            'Huawei Video',
            0,
            1,
            'travel',
            '1',
            '2020-01-03',
            '1',
        ),
        (
            'C010',
            '0000004',
            '1001',
            'native',
            'abcdef3',
            'ABC-AL00',
            '4G',
            'CPD',
            '2020-01-04 12:34:56.78',
            'Huawei Music',
            1,
            1,
            'game-avg',
            '2',
            '2020-01-04',
            '1',
        ),
        (
            'C011',
            '0000005',
            '1002',
            'splash',
            'abcdef4',
            'DEF-AL00',
            'WIFI',
            'CPM',
            '2020-01-05 12:34:56.78',
            'Huawei Reading',
            0,
            2,
            'game-avg',
            '2',
            '2020-01-05',
            '1',
        ),
        (
            'C012',
            '0000006',
            '1002',
            'splash',
            'abcdef5',
            'DEF-AL00',
            'WIFI',
            'CPM',
            '2020-01-06 12:34:56.78',
            'Huawei Magazine',
            1,
            2,
            'game-avg',
            '2',
            '2020-01-06',
            '0',
        ),
        (
            'C020',
            '0000007',
            '1003',
            'splash',
            'abcdef6',
            'XYZ-AL00',
            '4G',
            'CPT',
            '2020-01-07 12:34:56.78',
            'Huawei Browser',
            0,
            3,
            'reading',
            '3',
            '2020-01-07',
            '0',
        ),
        (
            'C021',
            '0000008',
            '1003',
            'splash',
            'abcdef7',
            'XYZ-AL00',
            '4G',
            'CPT',
            '2020-01-08 12:34:56.78',
            'Huawei Video',
            1,
            3,
            'reading',
            '3',
            '2020-01-08',
            '0',
        ),
        (
            'C022',
            '0000009',
            '1004',
            'splash',
            'abcdef8',
            'TUV-AL00',
            'WIFI',
            'CPC',
            '2020-01-09 12:34:56.78',
            'Huawei Music',
            0,
            4,
            'reading',
            '3',
            '2020-01-09',
            '0',
        ),
        (
            'C023',
            '0000010',
            '1004',
            'splash',
            'abcdef9',
            'TUV-AL00',
            'WIFI',
            'CPC',
            '2020-01-10 12:34:56.78',
            'Huawei Reading',
            1,
            4,
            'reading',
            '3',
            '2020-01-10',
            '1',
        ),
    ]

    schema = StructType([
        StructField('spread_app_id', StringType(), True),
        StructField('did', StringType(), True),
        StructField('adv_id', StringType(), True),
        StructField('media', StringType(), True),
        StructField('slot_id', StringType(), True),
        StructField('device_name', StringType(), True),
        StructField('net_type', StringType(), True),
        StructField('price_model', StringType(), True),
        StructField('action_time', StringType(), True),
        StructField('media_category', StringType(), True),
        StructField('gender', IntegerType(), True),
        StructField('age', IntegerType(), True),
        StructField('keyword', StringType(), True),
        StructField('keyword_index', StringType(), True),
        StructField('day', StringType(), True),
        StructField('did_bucket', StringType(), True),
    ])

    return spark.createDataFrame(spark.sparkContext.parallelize(data), schema)
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
from pyspark.sql.functions import expr, col, desc

spark = (SparkSession.builder.master("local[*]").appName(
    "eg1_departuredelays").getOrCreate())

csv_file_path = "data/flight_data/csv/departuredelays.csv"

schema = StructType([
    StructField("date", TimestampType(), True),
    StructField("delay", IntegerType(), True),
    StructField("distance", IntegerType(), True),
    StructField("origin", StringType(), True),
    StructField("destination", StringType(), True)
])

df = spark.read.csv(path=csv_file_path,
                    schema=schema,
                    timestampFormat="MMddHHmm",
                    header=True)

df.createOrReplaceTempView("us_delay_flights_tbl")

df.printSchema()

df.show(truncate=False)

#df.where(expr("distance > 1000")).select("origin","destination","distance").orderBy(desc("distance")).show(5)

spark.sql("""
# Example #19
def create_trainready_data(spark):
    data = [
        (
            0,
            0,
            '0000001',
            1000000001,
            [u'1578009600', u'1577923200', u'1577836800'],
            [u'travel', u'travel', u'travel,game-avg'],
            [u'1', u'1', u'1,2'],
            [u'1:2', u'1:2', u'1:2,2:1'],
            [u'1:1', u'1:1', u'1:1,2:0'],
            '1',
        ),
        (
            0,
            1,
            '0000002',
            1000000002,
            [u'1577923200'],
            [u'travel'],
            [u'1'],
            [u'1:2'],
            [u'1:1'],
            '1',
        ),
        (
            1,
            0,
            '0000003',
            1000000003,
            [u'1578009600'],
            [u'travel'],
            [u'1'],
            [u'1:2'],
            [u'1:1'],
            '1',
        ),
        (
            1,
            1,
            '0000004',
            1000000004,
            [u'1578096000'],
            [u'game-avg'],
            [u'2'],
            [u'2:2'],
            [u'2:1'],
            '1',
        ),
        (
            2,
            0,
            '0000005',
            1000000005,
            [u'1578182400'],
            [u'game-avg'],
            [u'2'],
            [u'2:2'],
            [u'2:1'],
            '1',
        ),
        (
            2,
            1,
            '0000006',
            1,
            [u'1578268800'],
            [u'game-avg'],
            [u'2'],
            [u'2:2'],
            [u'2:1'],
            '0',
        ),
        (
            3,
            0,
            '0000007',
            2,
            [u'1578355200'],
            [u'reading'],
            [u'3'],
            [u'3:2'],
            [u'3:1'],
            '0',
        ),
        (
            3,
            1,
            '0000008',
            3,
            [u'1578441600'],
            [u'reading'],
            [u'3'],
            [u'3:2'],
            [u'3:1'],
            '0',
        ),
        (
            4,
            0,
            '0000009',
            4,
            [u'1578528000'],
            [u'reading'],
            [u'3'],
            [u'3:2'],
            [u'3:1'],
            '0',
        ),
        (
            4,
            1,
            '0000010',
            1000000006,
            [u'1578614400'],
            [u'reading'],
            [u'3'],
            [u'3:2'],
            [u'3:1'],
            '1',
        ),
    ]

    schema = StructType([
        StructField('age', IntegerType(), True),
        StructField('gender', IntegerType(), True),
        StructField('did', StringType(), True),
        StructField('did_index', LongType(), True),
        StructField('interval_starting_time', ArrayType(StringType(), True),
                    True),
        StructField('interval_keywords', ArrayType(StringType(), True), True),
        StructField('kwi', ArrayType(StringType(), True), True),
        StructField('kwi_show_counts', ArrayType(StringType(), True), True),
        StructField('kwi_click_counts', ArrayType(StringType(), True), True),
        StructField('did_bucket', StringType(), True),
    ])

    return spark.createDataFrame(spark.sparkContext.parallelize(data), schema)
Example #20
    def register(self, name, f, returnType=None):
        """Register a Python function (including lambda function) or a user-defined function
        as a SQL function.

        :param name: name of the user-defined function in SQL statements.
        :param f: a Python function, or a user-defined function. The user-defined function can
            be either row-at-a-time or vectorized. See :meth:`pyspark.sql.functions.udf` and
            :meth:`pyspark.sql.functions.pandas_udf`.
        :param returnType: the return type of the registered user-defined function. The value can
            be either a :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.
        :return: a user-defined function.

        To register a nondeterministic Python function, users need to first build
        a nondeterministic user-defined function for the Python function and then register it
        as a SQL function.

        `returnType` can be optionally specified when `f` is a Python function but not
        when `f` is a user-defined function. Please see below.

        1. When `f` is a Python function:

            `returnType` defaults to string type and can be optionally specified. The produced
            object must match the specified type. In this case, this API works as if
            `register(name, f, returnType=StringType())`.

            >>> strlen = spark.udf.register("stringLengthString", lambda x: len(x))
            >>> spark.sql("SELECT stringLengthString('test')").collect()
            [Row(stringLengthString(test)=u'4')]

            >>> spark.sql("SELECT 'foo' AS text").select(strlen("text")).collect()
            [Row(stringLengthString(text)=u'3')]

            >>> from pyspark.sql.types import IntegerType
            >>> _ = spark.udf.register("stringLengthInt", lambda x: len(x), IntegerType())
            >>> spark.sql("SELECT stringLengthInt('test')").collect()
            [Row(stringLengthInt(test)=4)]

        2. When `f` is a user-defined function:

            Spark uses the return type of the given user-defined function as the return type of
            the registered user-defined function. `returnType` should not be specified.
            In this case, this API works as if `register(name, f)`.

            >>> from pyspark.sql.types import IntegerType
            >>> from pyspark.sql.functions import udf
            >>> slen = udf(lambda s: len(s), IntegerType())
            >>> _ = spark.udf.register("slen", slen)
            >>> spark.sql("SELECT slen('test')").collect()
            [Row(slen(test)=4)]

            >>> import random
            >>> from pyspark.sql.functions import udf
            >>> from pyspark.sql.types import IntegerType
            >>> random_udf = udf(lambda: random.randint(0, 100), IntegerType()).asNondeterministic()
            >>> new_random_udf = spark.udf.register("random_udf", random_udf)
            >>> spark.sql("SELECT random_udf()").collect()  # doctest: +SKIP
            [Row(random_udf()=82)]

            >>> from pyspark.sql.functions import pandas_udf, PandasUDFType
            >>> @pandas_udf("integer", PandasUDFType.SCALAR)  # doctest: +SKIP
            ... def add_one(x):
            ...     return x + 1
            ...
            >>> _ = spark.udf.register("add_one", add_one)  # doctest: +SKIP
            >>> spark.sql("SELECT add_one(id) FROM range(3)").collect()  # doctest: +SKIP
            [Row(add_one(id)=1), Row(add_one(id)=2), Row(add_one(id)=3)]

            .. note:: Registration for a user-defined function (case 2.) was added from
                Spark 2.3.0.
        """

        # This is to check whether the input function is from a user-defined function or
        # Python function.
        if hasattr(f, 'asNondeterministic'):
            if returnType is not None:
                raise TypeError(
                    "Invalid returnType: data type can not be specified when f is"
                    "a user-defined function, but got %s." % returnType)
            if f.evalType not in [
                    PythonEvalType.SQL_BATCHED_UDF,
                    PythonEvalType.SQL_SCALAR_PANDAS_UDF
            ]:
                raise ValueError(
                    "Invalid f: f must be either SQL_BATCHED_UDF or SQL_SCALAR_PANDAS_UDF"
                )
            register_udf = UserDefinedFunction(f.func,
                                               returnType=f.returnType,
                                               name=name,
                                               evalType=f.evalType,
                                               deterministic=f.deterministic)
            return_udf = f
        else:
            if returnType is None:
                returnType = StringType()
            register_udf = UserDefinedFunction(
                f,
                returnType=returnType,
                name=name,
                evalType=PythonEvalType.SQL_BATCHED_UDF)
            return_udf = register_udf._wrapped()
        self.sparkSession._jsparkSession.udf().registerPython(
            name, register_udf._judf)
        return return_udf
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType, DateType
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import pandas as pd

schema = StructType([
    StructField("Region", StringType()),
    StructField("Pais", StringType()),
    StructField("Departamento", StringType()),
    StructField("Canal Venta", StringType()),
    StructField("Prioridad Orden", StringType()),
    StructField("Fecha Orden", StringType()),
    StructField("Id Orden", StringType()),
    StructField("Fecha Envio", StringType()),
    StructField("Unidades Vendidas", IntegerType()),
    StructField("Precio Unidad", DoubleType()),
    StructField("Costo Unidad", DoubleType()),
    StructField("Ingreso Total", DoubleType()),
    StructField("Costo Total", DoubleType()),
    StructField("Ganancia Total", DoubleType()),
])


spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
Example #22
def index2term(vocabulary):
    return udf(lambda col: term_list(col, vocabulary), ArrayType(StringType()))
Example #23
class ExtractLinksJob(CCSparkJob):
    '''Extract links from WAT files and redirects from WARC files
    and save them as pairs <from, to>'''
    name = "ExtractLinks"

    output_schema = StructType([
        StructField("s", StringType(), True),
        StructField("t", StringType(), True)
    ])

    warc_parse_http_header = False

    records_response = None
    records_response_wat = None
    records_response_warc = None
    records_failed = None
    records_non_html = None
    records_response_redirect = None

    http_redirect_pattern = re.compile(rb'^HTTP\s*/\s*1\.[01]\s*30[1278]\b')
    http_redirect_location_pattern = re.compile(rb'^Location:\s*(\S+)',
                                                re.IGNORECASE)

    def add_arguments(self, parser):
        parser.add_argument("--intermediate_output",
                            type=str,
                            default=None,
                            help="Intermediate output to recover job from")

    @staticmethod
    def _url_join(base, link):
        # TODO: efficiently join without reparsing base
        # TODO: canonicalize
        pass

    def process_record(self, record):
        if self.is_wat_json_record(record):
            try:
                record = json.loads(record.content_stream().read())
            except ValueError as e:
                self.get_logger().error('Failed to load JSON: {}'.format(e))
                self.records_failed.add(1)
                return
            warc_header = record['Envelope']['WARC-Header-Metadata']
            if warc_header['WARC-Type'] != 'response':
                # WAT request or metadata records
                return
            self.records_response.add(1)
            self.records_response_wat.add(1)
            url = warc_header['WARC-Target-URI']
            for link in self.get_links(url, record):
                yield link
        elif record.rec_type == 'response':
            self.records_response.add(1)
            self.records_response_warc.add(1)
            stream = record.content_stream()
            http_status_line = stream.readline()
            if ExtractLinksJob.http_redirect_pattern.match(http_status_line):
                self.records_response_redirect.add(1)
            else:
                return
            line = stream.readline()
            while line:
                m = ExtractLinksJob.http_redirect_location_pattern.match(line)
                if m:
                    redir_to = m.group(1).strip()
                    try:
                        redir_to = redir_to.decode('utf-8')
                    except UnicodeError as e:
                        self.get_logger().warn(
                            'URL with unknown encoding: {} - {}'.format(
                                redir_to, e))
                    redir_from = record.rec_headers.get_header(
                        'WARC-Target-URI')
                    for link in self.yield_redirect(redir_from, redir_to,
                                                    http_status_line):
                        yield link
                    return
                elif line.strip() == '':
                    return
                line = stream.readline()

    def yield_redirect(self, src, target, http_status_line):
        if src != target:
            yield src, target

    def yield_links(self, from_url, base_url, links, url_attr='url'):
        # base_url = urlparse(base)
        if base_url is None:
            base_url = from_url
        for l in links:
            if url_attr in l:
                link = l[url_attr]
                # lurl = _url_join(base_url, urlparse(link)).geturl()
                try:
                    lurl = urljoin(base_url, link)
                except ValueError:
                    # skip links that cannot be resolved against the base URL
                    continue
                yield from_url, lurl

    def get_links(self, url, record):
        try:
            response_meta = record['Envelope']['Payload-Metadata'][
                'HTTP-Response-Metadata']
            if 'HTML-Metadata' not in response_meta:
                self.records_non_html.add(1)
                return
            html_meta = response_meta['HTML-Metadata']
            base = None
            if 'Head' in html_meta:
                head = html_meta['Head']
                if 'Base' in head:
                    try:
                        base = urljoin(url, head['Base'])
                    except ValueError:
                        pass
                if 'Link' in head:
                    # <link ...>
                    for l in self.yield_links(url, base, head['Link']):
                        yield l
                if 'Metas' in head:
                    for m in head['Metas']:
                        if 'property' in m and m['property'] == 'og:url':
                            for l in self.yield_links(url, base, [m],
                                                      'content'):
                                yield l
            if 'Links' in html_meta:
                for l in self.yield_links(url, base, html_meta['Links']):
                    yield l
        except KeyError as e:
            self.get_logger().error("Failed to parse record for {}: {}".format(
                url, e))
            self.records_failed.add(1)

    def init_accumulators(self, sc):
        super(ExtractLinksJob, self).init_accumulators(sc)

        self.records_failed = sc.accumulator(0)
        self.records_non_html = sc.accumulator(0)
        self.records_response = sc.accumulator(0)
        self.records_response_wat = sc.accumulator(0)
        self.records_response_warc = sc.accumulator(0)
        self.records_response_redirect = sc.accumulator(0)

    def log_aggregators(self, sc):
        super(ExtractLinksJob, self).log_aggregators(sc)

        self.log_aggregator(sc, self.records_response, 'response records = {}')
        self.log_aggregator(sc, self.records_failed,
                            'records failed to process = {}')
        self.log_aggregator(sc, self.records_non_html, 'records not HTML = {}')
        self.log_aggregator(sc, self.records_response_wat,
                            'response records WAT = {}')
        self.log_aggregator(sc, self.records_response_warc,
                            'response records WARC = {}')
        self.log_aggregator(sc, self.records_response_redirect,
                            'response records redirects = {}')

    def run_job(self, sc, sqlc):
        output = None
        if self.args.input != '':
            input_data = sc.textFile(
                self.args.input, minPartitions=self.args.num_input_partitions)
            output = input_data.mapPartitionsWithIndex(self.process_warcs)

        if self.args.intermediate_output is None:
            df = sqlc.createDataFrame(output, schema=self.output_schema)
        else:
            if output is not None:
                sqlc.createDataFrame(output, schema=self.output_schema) \
                    .write \
                    .format(self.args.output_format) \
                    .option("compression", self.args.output_compression) \
                    .saveAsTable(self.args.intermediate_output)
                self.log_aggregators(sc)
            warehouse_dir = sc.getConf().get('spark.sql.warehouse.dir',
                                             'spark-warehouse')
            intermediate_output = os.path.join(warehouse_dir,
                                               self.args.intermediate_output)
            df = sqlc.read.parquet(intermediate_output)

        df.dropDuplicates() \
          .coalesce(self.args.num_output_partitions) \
          .sortWithinPartitions('s', 't') \
          .write \
          .format(self.args.output_format) \
          .option("compression", self.args.output_compression) \
          .saveAsTable(self.args.output)

        self.log_aggregators(sc)
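
# The link extraction above resolves relative hrefs against the page URL (or a
# <base> element when present) with urllib.parse.urljoin, as in
# yield_links/get_links. A minimal standalone sketch of that resolution step;
# the sample URLs are made up for illustration:
from urllib.parse import urljoin

page_url = "https://example.com/news/index.html"
base_url = urljoin(page_url, "/archive/")  # honor a <base href="/archive/">
links = ["story1.html", "../about.html", "https://other.org/x"]

for link in links:
    try:
        resolved = urljoin(base_url, link)
    except ValueError:
        continue  # skip malformed links, as in yield_links
    print(page_url, "->", resolved)
# https://example.com/news/index.html -> https://example.com/archive/story1.html
# https://example.com/news/index.html -> https://example.com/about.html
# https://example.com/news/index.html -> https://other.org/x
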
Example #24
class ExtractKeywordJob(CCSparkJob):
    """ Extract keywords from title in Common Crawl WAT files """

    name = "ExtractKeyword"

    output_schema = StructType([
        StructField("url", StringType(), True),
        StructField("keywords", StringType(), True),
        StructField("title", StringType(), True),
        StructField("description", StringType(), True),
    ])

    def process_record(self, record):
        ''' returns list of keywords given WAT file'''
        if self.is_wat_json_record(record):
            record = json.loads(record.content_stream().read())
            url = self.get_url(record)
            title = self.get_title(record)
            links = self.get_links(record)

            if title and self.is_english(title) and not self.has_ads(links):
                descrip = self.get_description(record)

                title_words = [
                 word for word in title.lower().split() \
                 if not word.isdigit() and
                 not self.is_stopword(word) and
                 word.isalnum()
                ]
                for word in title_words:
                    yield url, word, title, descrip

    def run_job(self, sc, sqlc):
        # convert .gz input file to RDD of strings
        input_data = sc.textFile(self.args.input,
                                 minPartitions=self.args.num_input_partitions)

        # map func process_warcs across partition while keeping index
        output = input_data.mapPartitionsWithIndex(self.process_warcs)

        # create SQL DF from output RDD
        sqlc.createDataFrame(output, schema=self.output_schema) \
            .coalesce(self.args.num_output_partitions) \
            .write \
            .format("parquet") \
            .saveAsTable(self.args.output)

        self.get_logger(sc).info('records processed = {}'.format(
            self.records_processed.value))

    def is_wat_json_record(self, record):
        ''' Return true if WARC record is a WAT record'''
        return (record.rec_type == 'metadata'
                and record.content_type == 'application/json')

    def get_url(self, record):
        # not using safe access .get on dict since so many key-value pairs could be missing
        try:
            return record['Envelope']['WARC-Header-Metadata'][
                'WARC-Target-URI']
        except KeyError:  # missing in metadata
            return ''
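
# The keyword filter in process_record keeps lower-cased title words that are
# alphanumeric, not pure digits, and not stopwords. A plain-Python sketch of
# that filter; the stopword set here is a stand-in for the job's own list:
STOPWORDS = {"the", "a", "an", "of", "and", "to", "in"}

def title_keywords(title):
    return [
        word for word in title.lower().split()
        if not word.isdigit() and word not in STOPWORDS and word.isalnum()
    ]

print(title_keywords("The 10 Best Hiking Trails of 2019"))
# ['best', 'hiking', 'trails']
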
def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    # job = Job(glueContext)
    # job.init(args['JOB_NAME'], args)
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    is_dev = True

    ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh')
    today = datetime.now(ho_chi_minh_timezone)
    today_second = long(today.strftime("%s"))
    print('today_id: ', today_second)

    #------------------------------------------------------------------------------------------------------------------#

    # ------------------------------------------------------------------------------------------------------------------#

    def concaText(student_behavior_date, behavior_id, student_id, contact_id,
                  package_code, package_endtime, package_starttime,
                  student_level_code, student_status_code, transformed_at):
        text_concat = ""
        if student_behavior_date is not None:
            text_concat += str(student_behavior_date)
        if behavior_id is not None:
            text_concat += str(behavior_id)
        if student_id is not None:
            text_concat += str(student_id)
        if contact_id is not None:
            text_concat += str(contact_id)
        if package_code is not None:
            text_concat += str(package_code)
        if package_endtime is not None:
            text_concat += str(package_endtime)
        if package_starttime is not None:
            text_concat += str(package_starttime)
        if student_level_code is not None:
            text_concat += str(student_level_code)
        if student_status_code is not None:
            text_concat += str(student_status_code)
        if transformed_at is not None:
            text_concat += str(transformed_at)
        return text_concat

    concaText = f.udf(concaText, StringType())

    def convertStudentIdToLong(student_id):
        try:
            student_id_long = long(student_id)
            return student_id_long
        except:
            return 0L

    convertStudentIdToLong = f.udf(convertStudentIdToLong, LongType())
    # ------------------------------------------------------------------------------------------------------------------#

    ##################################################
    # Load the technical test data from the student_technical_test table
    dyf_datasourceTech = glueContext.create_dynamic_frame.from_catalog(
        database="dm_toa", table_name="student_technical_test_odin")

    # print('dyf_datasourceTech')
    # dyf_datasourceTech.printSchema()

    # Select the required fields
    dyf_datasourceTech = dyf_datasourceTech.select_fields([
        '_key', 'thoigianhenktkt', 'ketluan', 'emailhocvien', 'dauthoigian',
        'emailadvisor', 'nguoitiepnhan', 'trinhdohocvien'
    ])

    dyf_datasourceTech = dyf_datasourceTech.resolveChoice(
        specs=[('_key', 'cast:long')])

    if (dyf_datasourceTech.count() > 0):
        dyf_datasourceTech = Filter.apply(
            frame=dyf_datasourceTech,
            f=lambda x: x["emailhocvien"] is not None and x["emailhocvien"] !=
            '' and x["thoigianhenktkt"] is not None and x[
                "thoigianhenktkt"] != '' and x["ketluan"] == 'Pass')

        dyf_datasourceTech_number = dyf_datasourceTech.count()
        print("Count data 2:  ", dyf_datasourceTech_number)

        if dyf_datasourceTech_number < 1:
            return

        dy_datasourceTech = dyf_datasourceTech.toDF()

        dy_datasourceTech = dy_datasourceTech.limit(100)

        print('dy_datasourceTech')
        dy_datasourceTech.printSchema()
        dy_datasourceTech = dy_datasourceTech.withColumn(
            'thoigianhenktkt_id',
            f.unix_timestamp('thoigianhenktkt',
                             'yyyy-MM-dd HH:mm:ss').cast('long'))

        print('dy_datasourceTech__2')
        dy_datasourceTech.printSchema()

        # get the first activation time
        w2 = Window.partitionBy("emailhocvien").orderBy(
            f.col("thoigianhenktkt_id").desc())
        dy_datasourceTech = dy_datasourceTech.withColumn("row", f.row_number().over(w2)) \
            .where(f.col('row') <= 1)

        print('dy_datasourceTech__3')
        dy_datasourceTech.printSchema()

        #--------------------------------------------------------------------------------------------------------------#
        dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
            database="tig_advisor", table_name="student_contact")

        # select the fields
        dyf_student_contact = dyf_student_contact.select_fields(
            ['_key', 'contact_id', 'student_id', 'user_name'])

        dyf_student_contact = Filter.apply(
            frame=dyf_student_contact,
            f=lambda x: x["contact_id"] is not None and x["contact_id"] != ''
            and x["student_id"] is not None and x["student_id"] != '' and x[
                "user_name"] is not None and x["user_name"] != '')

        dyf_student_contact_number = dyf_student_contact.count()
        print('dyf_student_contact_number::number: ',
              dyf_student_contact_number)
        if dyf_student_contact_number < 1:
            return

        dy_student_contact = dyf_student_contact.toDF()
        dy_student_contact = dy_student_contact.dropDuplicates(['student_id'])

        dy_join_teach_concat = dy_datasourceTech.join(
            dy_student_contact,
            dy_datasourceTech.emailhocvien == dy_student_contact.user_name)

        print('dyf_join_teach_concat::schema')
        dy_join_teach_concat.printSchema()

        join_teach_concat_number = dy_join_teach_concat.count()
        print('join_teach_concat_number::number: ', join_teach_concat_number)
        if join_teach_concat_number < 1:
            return

        #--------------------------------------------------------------------------------------------------------------#

        dyf_student_package_status = glueContext.create_dynamic_frame.from_catalog(
            database="od_student_behavior", table_name="student_status")

        dyf_student_package_status = dyf_student_package_status\
            .select_fields(['contact_id', 'status_code', 'start_date', 'end_date'])\
            .rename_field('contact_id', 'contact_id_ps')

        print('dyf_student_package_status::drop_duplicates')

        df_student_package_status = dyf_student_package_status.toDF()
        print('dyf_student_package_status::drop_duplicates::before: ',
              df_student_package_status.count())
        df_student_package_status = df_student_package_status.drop_duplicates()
        print('dyf_student_package_status::drop_duplicates::after: ',
              df_student_package_status.count())

        print('dy_student_package_status')
        df_student_package_status.printSchema()
        # --------------------------------------------------------------------------------------------------------------#
        dyf_student_package = glueContext.create_dynamic_frame.from_catalog(
            database="od_student_behavior", table_name="student_package")

        print('dyf_student_package__0')
        dyf_student_package.printSchema()


        dyf_student_package = dyf_student_package \
            .select_fields(['student_id', 'package_code', 'start_time', 'end_time'])\
            .rename_field('student_id', 'student_id_pk')

        # --------------------------------------------------------------------------------------------------------------#

        print('dyf_student_package__1')
        dyf_student_package.printSchema()

        dyf_student_package = dyf_student_package.resolveChoice(
            specs=[('start_time', 'cast:long'), ('end_time', 'cast:long')])

        print('dyf_student_package__2')
        dyf_student_package.printSchema()

        df_student_package = dyf_student_package.toDF()
        print('df_student_package::drop_duplicates::before: ',
              df_student_package.count())
        df_student_package = df_student_package.drop_duplicates()
        print('df_student_package::drop_duplicates::after: ',
              df_student_package.count())

        print('df_student_package')
        df_student_package.printSchema()
        df_student_package.show(3)

        df_student_package_number = df_student_package.count()
        print('df_student_package_number: ', df_student_package_number)

        # --------------------------------------------------------------------------------------------------------------#

        # --------------------------------------------------------------------------------------------------------------#

        dy_join_teach_concat_number = dy_join_teach_concat.count()
        print('dy_join_teach_concat_number: ', dy_join_teach_concat_number)

        join_result = dy_join_teach_concat\
            .join(df_student_package_status,
                 (dy_join_teach_concat.contact_id == df_student_package_status.contact_id_ps)
                 & (dy_join_teach_concat.thoigianhenktkt_id >= df_student_package_status.start_date)
                 & (dy_join_teach_concat.thoigianhenktkt_id < df_student_package_status.end_date),
                 'left'
                  )\
            .join(df_student_package,
                  (dy_join_teach_concat.student_id == df_student_package.student_id_pk)
                  &(dy_join_teach_concat.thoigianhenktkt_id >= df_student_package.start_time)
                  &(dy_join_teach_concat.thoigianhenktkt_id < df_student_package.end_time),
                  'left'
                  )

        print('join_result')
        join_result.printSchema()

        join_result_number = join_result.count()
        print('join_result_number: ', join_result_number)
        if join_result_number < 1:
            return
        join_result.show(3)

        student_id_unavailable = 0L
        package_endtime_unavailable = 99999999999L
        package_starttime_unavailable = 0L
        package_code_unavailable = 'UNAVAILABLE'
        student_level_code_unavailable = 'UNAVAILABLE'
        student_status_code_unavailable = 'UNAVAILABLE'
        measure1_unavailable = 0
        measure2_unavailable = 0
        measure3_unavailable = 0
        measure4_unavailable = float(0.0)

        # join_result = join_result.withColumnRenamed('student_id', 'student_id_a')

        join_result = join_result.select(
            join_result.thoigianhenktkt_id.alias('student_behavior_date'),
            f.lit(5L).alias('behavior_id'),
            'student_id',
            join_result.contact_id.alias('contact_id'),
            join_result.package_code.alias('package_code'),
            join_result.end_time.cast('long').alias('package_endtime'),
            join_result.start_time.cast('long').alias('package_starttime'),
            join_result.trinhdohocvien.cast('string').alias(
                'student_level_code'),
            join_result.status_code.cast('string').alias(
                'student_status_code'),
            f.lit(today_second).cast('long').alias('transformed_at'),
        )

        join_result = join_result.na.fill({
            'package_code': package_code_unavailable,
            'package_endtime': package_endtime_unavailable,
            'package_starttime': package_starttime_unavailable,
            'student_level_code': student_level_code_unavailable,
            'student_status_code': student_status_code_unavailable
        })

        print('join_result--1')
        join_result.printSchema()
        join_result.show(1)

        join_result = join_result.withColumn(
            'student_behavior_id',
            f.md5(
                concaText(join_result.student_behavior_date,
                          join_result.behavior_id, join_result.student_id,
                          join_result.contact_id, join_result.package_code,
                          join_result.package_endtime,
                          join_result.package_starttime,
                          join_result.student_level_code,
                          join_result.student_status_code,
                          join_result.transformed_at)))
        #
        print('join_result--2')
        join_result.printSchema()
        join_result.show(5)

        dyf_join_result = DynamicFrame.fromDF(join_result, glueContext,
                                              'dyf_join_result')

        dyf_join_result = Filter.apply(
            frame=dyf_join_result,
            f=lambda x: x["contact_id"] is not None and x["contact_id"] != '')

        apply_output = ApplyMapping.apply(
            frame=dyf_join_result,
            mappings=[("student_behavior_id", "string", "student_behavior_id",
                       "string"),
                      ("student_behavior_date", "long",
                       "student_behavior_date", "long"),
                      ("behavior_id", "long", "behavior_id", "long"),
                      ("student_id", "string", "student_id", "long"),
                      ("contact_id", "string", "contact_id", "string"),
                      ("package_code", "long", "package_code", "string"),
                      ("package_endtime", "long", "package_endtime", "long"),
                      ("package_starttime", "long", "package_starttime",
                       "long"),
                      ("student_level_code", "string", "student_level_code",
                       "string"),
                      ("student_status_code", "string", "student_status_code",
                       "string"),
                      ("transformed_at", "long", "transformed_at", "long")])

        dfy_output = ResolveChoice.apply(frame=apply_output,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice2")
        #
        glueContext.write_dynamic_frame.from_options(
            frame=dfy_output,
            connection_type="s3",
            connection_options={
                "path": "s3://dtsodin/student_behavior/student_behavior",
                "partitionKeys": ["behavior_id"]
            },
            format="parquet")
Example #26
from libs.parse import parse
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, IntegerType

TITLE_COLUMN_TEMPLATE = "{title} ({year})"


def _get_movie_title(title_column):
    parsed = parse(TITLE_COLUMN_TEMPLATE, title_column)
    return parsed.named.get("title") if parsed else None


def _get_movie_year(title_column):
    parsed = parse(TITLE_COLUMN_TEMPLATE, title_column)
    year = parsed.named.get("year") if parsed else None
    return int(year) if year and year.isdigit() else None

get_movie_title_udf = udf(lambda z: _get_movie_title(z), StringType())
get_movie_year_udf = udf(lambda z: _get_movie_year(z), IntegerType())
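
# A usage sketch for the two UDFs above; the local SparkSession and the sample
# MovieLens-style titles are illustrative assumptions:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.appName("movie_title_parsing").getOrCreate()

movies = spark.createDataFrame([("Toy Story (1995)",), ("Heat (1995)",)],
                               ["title"])

movies.select(
    col("title"),
    get_movie_title_udf(col("title")).alias("movie_title"),
    get_movie_year_udf(col("title")).alias("movie_year"),
).show(truncate=False)
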
Example #27
def getPBISchema(file):
    validationSchema = StructType([
        StructField("ticket_id", StringType(), True),
        StructField("portal_ticket_id", StringType(), True),
        StructField("sigma_ticket_id", StringType(), True),
        StructField("bmc_ticket_submitter", StringType(), True),
        StructField("bmc_ticket_submit_date", StringType(), True),
        StructField("bmc_ticket_last_modified_by", StringType(), True),
        StructField("bmc_ticket_last_modification_date", StringType(), True),
        StructField("status_id", StringType(), True),
        StructField("substatus_id", StringType(), True),
        StructField("bmc_ticket_assigned_support_group_id", StringType(),
                    True),
        StructField("impact_id", StringType(), True),
        StructField("urgency_id", StringType(), True),
        StructField("priority_id", StringType(), True),
        StructField("portal_ticket_summary", StringType(), True),
        StructField("portal_ticket_notes", StringType(), True),
        StructField("bmc_ticket_contact_first_name", StringType(), True),
        StructField("bmc_ticket_contact_email", StringType(), True),
        StructField("portal_end_user_site_location", StringType(), True),
        StructField("bmc_ticket_required_resolution_date", StringType(), True),
        StructField("bmc_ticket_father_id", StringType(), True),
        StructField("bmc_ticket_last_resolved_date", StringType(), True),
        StructField("bmc_ticket_closed_date", StringType(), True),
        StructField("bmc_ticket_responded_date", StringType(), True),
        StructField("bmc_model_version_id", StringType(), True),
        StructField("bmc_product_id", StringType(), True),
        StructField("bmc_manufacturer_id", StringType(), True),
        StructField("portal_ticket_planned_start_date", StringType(), True),
        StructField("admin_number", StringType(), True),
        StructField("sigma_additional_information", StringType(), True),
        StructField("sigma_ticket_description", StringType(), True),
        StructField("product_categorization_tier_1", StringType(), True),
        StructField("product_categorization_tier_2", StringType(), True),
        StructField("product_categorization_tier_3", StringType(), True),
        StructField("operational_categorization_tier_1", StringType(), True),
        StructField("operational_categorization_tier_2", StringType(), True),
        StructField("operational_categorization_tier_3", StringType(), True),
        StructField("resolution_category_tier_1", StringType(), True),
        StructField("resolution_category_tier_2", StringType(), True),
        StructField("resolution_category_tier_3", StringType(), True),
        StructField("bmc_ticket_summary", StringType(), True),
        StructField("bmc_ticket_owner_support_group_id", StringType(), True),
        StructField("csp_lite_request_description", StringType(), True),
        StructField("csp_lite_element_summary", StringType(), True),
        StructField("csp_lite_additional_comments", StringType(), True),
        StructField("csp_express_ci_id", StringType(), True),
        StructField("ci_name", StringType(), True),
        StructField("admin_number_1", StringType(), True),
        StructField("ci_country", StringType(), True),
        StructField("city", StringType(), True),
        StructField("customer", StringType(), True),
        StructField("ci_id_fast", StringType(), True),
        StructField("admin_number_2", StringType(), True),
        StructField("instanceid", StringType(), True),
        StructField("coordinator_group", StringType(), True),
        StructField("problem_coordinator", StringType(), True),
        StructField("assigned_group", StringType(), True),
        StructField("assignee", StringType(), True),
        StructField("_corrupt_record", StringType(), True)
    ])
    return validationSchema
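
# A possible way to use the schema above (a sketch; the SparkSession, the file
# path and the read options are assumptions): because the schema ends with a
# "_corrupt_record" column, reading in PERMISSIVE mode lets Spark keep rows it
# cannot parse and store the raw malformed line in that column.
df = (spark.read
      .option("header", "true")
      .option("mode", "PERMISSIVE")
      .option("columnNameOfCorruptRecord", "_corrupt_record")
      .schema(getPBISchema(file=None))  # the file argument is unused above
      .csv("s3://bucket/pbi/tickets.csv"))  # hypothetical path

# Cache before inspecting only the corrupt-record column; recent Spark versions
# disallow file-based queries that reference nothing but that internal column.
bad_rows = df.cache().filter(df["_corrupt_record"].isNotNull())
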
Example #28
def create_keywords_raw_log(spark):
    data = [
        ('0000001', '1000', 'splash', 'abcdef0', 'C000', 'DUB-AL00', 'WIFI',
         'CPC', '2020-01-01 12:34:56.78'),  # travel
        ('0000002', '1000', 'splash', 'abcdef1', 'C001', 'DUB-AL00', 'WIFI',
         'CPC', '2020-01-02 12:34:56.78'),  # travel
        ('0000003', '1001', 'native', 'abcdef2', 'C002', 'ABC-AL00', '4G',
         'CPD', '2020-01-03 12:34:56.78'),  # travel
        ('0000005', '1001', 'native', 'abcdef2', 'C002', 'ABC-AL00', '4G',
         'CPD', '2020-01-03 12:34:56.78'),  # travel
        ('0000006', '1001', 'native', 'abcdef2', 'C002', 'ABC-AL00', '4G',
         'CPD', '2020-01-03 12:34:56.78'),  # travel
        ('0000007', '1001', 'native', 'abcdef2', 'C002', 'ABC-AL00', '4G',
         'CPD', '2020-01-03 12:34:56.78'),  # travel
        ('0000008', '1001', 'native', 'abcdef2', 'C003', 'ABC-AL00', '4G',
         'CPD', '2020-01-03 12:34:56.78'),  # travel
        ('0000009', '1001', 'native', 'abcdef2', 'C003', 'ABC-AL00', '4G',
         'CPD', '2020-01-03 12:34:56.78'),  # travel
        ('0000010', '1001', 'native', 'abcdef2', 'C003', 'ABC-AL00', '4G',
         'CPD', '2020-01-03 12:34:56.78'),  # travel
        ('0000011', '1001', 'native', 'abcdef2', 'C004', 'ABC-AL00', '4G',
         'CPD', '2020-01-03 12:34:56.78'),  # travel
        ('0000012', '1001', 'native', 'abcdef2', 'C004', 'ABC-AL00', '4G',
         'CPD', '2020-01-03 12:34:56.78'),  # travel
        ('0000013', '1001', 'native', 'abcdef2', 'C004', 'ABC-AL00', '4G',
         'CPD', '2020-01-03 12:34:56.78'),  # travel
        ('0000014', '1001', 'native', 'abcdef2', 'C010', 'ABC-AL00', '4G',
         'CPD', '2020-01-03 12:34:56.78'),  # game-avg
        ('0000004', '1001', 'native', 'abcdef3', 'C010', 'ABC-AL00', '4G',
         'CPD', '2020-01-04 12:34:56.78'),  # game-avg
        ('0000005', '1001', 'native', 'abcdef3', 'C010', 'ABC-AL00', '4G',
         'CPD', '2020-01-05 12:34:56.78'),  # game-avg
        ('0000007', '1003', 'splash', 'abcdef6', 'C020', 'XYZ-AL00', '4G',
         'CPT', '2020-01-07 12:34:56.78'
         ),  # reading; only one entry for this keyword so will be excluded.
        ('0000008', '1003', 'splash', 'abcdef6', 'C030', 'XYZ-AL00', '4G',
         'CPT', '2020-01-08 12:34:56.78'
         ),  # shopping; only one entry for this keyword so will be excluded.
        ('0000009', '1003', 'splash', 'abcdef6', 'C040', 'XYZ-AL00', '4G',
         'CPT', '2020-01-09 12:34:56.78'
         ),  # education; just enough entries to be included.
        ('0000009', '1003', 'splash', 'abcdef6', 'C040', 'XYZ-AL00', '4G',
         'CPT', '2020-01-09 12:34:56.78'
         ),  # education; just enough entries to be included.
        ('0000010', '1003', 'splash', 'abcdef6', 'C050', 'XYZ-AL00', '4G',
         'CPT', '2020-01-10 12:34:56.78'
         ),  # no mapping; only one entry for this keyword so will be excluded.
        ('0000001', '1000', 'native', 'abcde10', 'C020', 'JKL-AL00', '4G',
         'CPD', '2020-01-11 12:34:56.78'),  # reading; outside the date range.
    ]

    schema = StructType([
        StructField("did", StringType(), True),
        StructField("adv_id", StringType(), True),
        StructField("media", StringType(), True),
        StructField("slot_id", StringType(), True),
        StructField("spread_app_id", StringType(), True),
        StructField("device_name", StringType(), True),
        StructField("net_type", StringType(), True),
        StructField("price_model", StringType(), True),
        StructField("action_time", StringType(), True)
    ])

    return spark.createDataFrame(spark.sparkContext.parallelize(data), schema)
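
# The inline comments above hint at the downstream filtering this fixture is
# meant to exercise: map each log row to a keyword, drop keywords that do not
# reach a minimum number of entries, and drop rows outside the date window.
# A rough sketch of that kind of filter; the threshold, the date bounds and the
# spread_app_id-to-keyword mapping dataframe are illustrative assumptions:
from pyspark.sql import functions as F

def filter_keywords(df_log, df_keyword_map, start_time, end_time, min_entries=2):
    df = (df_log
          .join(df_keyword_map, on="spread_app_id", how="inner")
          .where(F.col("action_time").between(start_time, end_time)))
    counts = df.groupBy("keyword").count().where(F.col("count") >= min_entries)
    return df.join(counts.select("keyword"), on="keyword", how="inner")
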
class VPDelaysCalculator:
    """Class to handle calculations and updates to the VPDelays table
  """

    TableSchema = StructType([
        StructField("RouteId", StringType(), True),
        StructField("TripId", StringType(), False),
        StructField("StopId", StringType(), False),
        StructField("StopName", StringType(), True),
        StructField("StopLat", DoubleType(), False),
        StructField("StopLon", DoubleType(), False),
        StructField("SchedDT", TimestampType(), False),
        StructField("EstLat", DoubleType(), False),
        StructField("EstLon", DoubleType(), False),
        StructField("EstDT", TimestampType(), False),
        StructField("EstDist", DoubleType(), False),
        StructField("EstDelay", DoubleType(), False)
    ])

    VehPos = namedtuple("VehPos", "trip_id DT coords")

    def __init__(self, spark, pqDate, dfStopTimes, dfVehPos):
        """Initializes the instance

    Args:
      spark: Spark Session object
      pqDate: a date of the Parquet file with vehicle positions
      dfStopTimes: dataframe of stop times from the schedule
      dfVehPos: dataframe of vehicle positions
    """

        self.spark = spark
        self.pqDate = pqDate
        self.dfStopTimes = dfStopTimes
        self.dfVehPos = dfVehPos

    def create_result_df(self):
        """Creates the delays dataframe from the schedule and vehicle positions
    dataframes
    """

        rddStopTimes = self.dfStopTimes.rdd \
          .map(lambda rec: (rec.trip_id, tuple(rec))) \
          .groupByKey()

        rddVehPos = self.dfVehPos.rdd \
          .map(lambda rec: (rec.TripId, tuple(rec))) \
          .groupByKey()

        pqDate = self.pqDate
        cutoffs = _compute_date_cutoffs(pqDate, pytz.utc)
        rddVPDelays = rddStopTimes.join(rddVehPos) \
          .flatMap(lambda keyTpl1Tpl2:
            VPDelaysCalculator._process_joined(pqDate, cutoffs, keyTpl1Tpl2))

        return self.spark.createDataFrame(rddVPDelays, self.TableSchema)

    @staticmethod
    def _process_joined(pqDate, cutoffs, keyTpl1Tpl2):
        stopTimeRecs = keyTpl1Tpl2[1][0]
        vehPosRecs = keyTpl1Tpl2[1][1]

        stopTimeLst = []
        for stopTimeRec in stopTimeRecs:
            try:
                dt = utils.sched_time_to_dt(stopTimeRec[2], pqDate)
                dt = Settings.MBTA_TZ.localize(dt).astimezone(pytz.UTC)
            except (pytz.exceptions.AmbiguousTimeError,
                    pytz.exceptions.NonExistentTimeError):
                continue
            stopTimeLst.append(
                dict(tripId=stopTimeRec[0],
                     stopId=stopTimeRec[1],
                     stopSeq=stopTimeRec[3],
                     stopName=stopTimeRec[4],
                     routeId=stopTimeRec[8],
                     coords=shapelib.Point.FromLatLng(stopTimeRec[6],
                                                      stopTimeRec[7]),
                     schedDT=dt.replace(tzinfo=None)))

        vehPosLst = []
        for vehPosRec in vehPosRecs:
            vehPosLst.append(
                VPDelaysCalculator.VehPos(
                    vehPosRec[3],
                    datetime.utcfromtimestamp(vehPosRec[1].timestamp()),
                    shapelib.Point.FromLatLng(vehPosRec[4], vehPosRec[5])))

        return VPDelaysCalculator._calc_delays(cutoffs, stopTimeLst, vehPosLst)

    @staticmethod
    def _calc_delays(cutoffs, stopTimeLst, vehPosLst):
        ret = []
        mxdval = 1e10 if Settings.MaxAbsDelay <= 0 else Settings.MaxAbsDelay

        prevEstDT = None
        for stopTime in sorted(stopTimeLst, key=lambda x: x['schedDT']):
            stopCoords = stopTime["coords"]
            stopLatLon = stopCoords.ToLatLng()

            curClosest = None
            curDist = 1e10
            curSchedDT = None
            for vp in vehPosLst:
                dist = stopCoords.GetDistanceMeters(vp.coords)
                if dist < curDist:
                    # adjust the scheduled time for possible date mismatches
                    schedDT = stopTime["schedDT"]
                    daysDiff = round(
                        (schedDT - vp.DT).total_seconds() / (24 * 3600))
                    schedDT -= timedelta(days=daysDiff)
                    # ignore datapoints where the absolute value of delay is too large
                    if -mxdval < (vp.DT - schedDT).total_seconds() < mxdval:
                        curDist = dist
                        curClosest = vp
                        curSchedDT = schedDT

            if curClosest and cutoffs[0] < curSchedDT < cutoffs[1]:
                vpLatLon = curClosest.coords.ToLatLng()
                ret.append((stopTime["routeId"], stopTime["tripId"],
                            stopTime["stopId"], stopTime["stopName"],
                            stopLatLon[0], stopLatLon[1], curSchedDT,
                            vpLatLon[0], vpLatLon[1], curClosest.DT, curDist,
                            (curClosest.DT - curSchedDT).total_seconds()))

                if prevEstDT:
                    diffWPrev = (curClosest.DT - prevEstDT).total_seconds()
                    if diffWPrev < -120:
                        return []
                prevEstDT = curClosest.DT
        return ret
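
# To make the date-mismatch adjustment in _calc_delays concrete: a scheduled
# stop time is rendered on the Parquet date, but the matching vehicle position
# may fall on a neighboring calendar day, so the scheduled time is shifted by
# the rounded whole-day difference before the delay is computed. A small
# worked example with made-up timestamps:
from datetime import datetime, timedelta

sched_dt = datetime(2020, 1, 1, 0, 10)  # schedule rendered on the Parquet date
veh_dt = datetime(2020, 1, 2, 0, 15)    # vehicle actually observed a day later

days_diff = round((sched_dt - veh_dt).total_seconds() / (24 * 3600))  # -> -1
sched_dt -= timedelta(days=days_diff)   # shifted to 2020-01-02 00:10
delay_seconds = (veh_dt - sched_dt).total_seconds()  # -> 300.0 (5 minutes late)
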
# COMMAND ----------

# MAGIC %md
# MAGIC <h2><img src="https://files.training.databricks.com/images/105/logo_spark_tiny.png"> Step 2</h2>
# MAGIC <h3>A Schema for parsing JSON</h3>
# MAGIC
# MAGIC Because the schema is so complex, it is being provided for you.
# MAGIC
# MAGIC Simply run the following cell and proceed to the next step.

# COMMAND ----------

from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType, ArrayType

twitSchema = StructType([
    StructField("hashTags", ArrayType(StringType(), False), True),
    StructField("text", StringType(), True),
    StructField("userScreenName", StringType(), True),
    StructField("id", LongType(), True),
    StructField("createdAt", LongType(), True),
    StructField("retweetCount", IntegerType(), True),
    StructField("lang", StringType(), True),
    StructField("favoriteCount", IntegerType(), True),
    StructField("user", StringType(), True),
    StructField(
        "place",
        StructType([
            StructField("coordinates", StringType(), True),
            StructField("name", StringType(), True),
            StructField("placeType", StringType(), True),
            StructField("fullName", StringType(), True),