Beispiel #1
0
def run(spark):
    """Indexes Protobuf files by updating the S3Prefixes and VehPosPb tables

  Args:
    spark: Spark Session object
  """

    log = utils.get_logger()

    with DBConnCommonQueries() as conn:
        dbtables.create_if_not_exists(conn, dbtables.S3Prefixes)
        dbtables.create_if_not_exists(conn, dbtables.VehPosPb)

    pfxs = explore_s3prefixes()
    s3Mgr = s3.S3Mgr()
    for pfx in pfxs:
        fullPfx = '/'.join(("pb", "VehiclePos", pfx))
        keys = s3Mgr.fetch_keys(fullPfx)
        if len(keys) > 0:
            log.info("PROCESSING %d KEYS FOR %s", len(keys), pfx)
            file_list = spark.sparkContext.parallelize(keys)
            file_list \
              .map(dbtables.VehPosPb.build_tuple_from_protobuf) \
              .foreachPartition(push_vehpospb_dbtpls)
            log.info("PROCESSED %d KEYS FOR %s", len(keys), pfx)
        tpl = (pfx, len(keys))

        with DBConn() as conn:
            dbtables.S3Prefixes.insert_values(conn, pfx, len(keys))
            conn.commit()
        log.info("PUSHED S3Prefix %s", str(tpl))
def run(spark):
    """Updates the vehicle positions database table

  Args:
    spark: Spark Session object
  """

    log = utils.get_logger()

    with DBConnCommonQueries() as conn:
        dbtables.create_if_not_exists(conn, dbtables.VehPos)

    with DBConn() as conn:
        keys = dbtables.VehPosPb.select_protobuf_keys_not_invehpos(conn)
    log.info("Got %d keys", len(keys))

    step = 1000
    for i in range(0, len(keys), step):
        lower = i
        upper = i + step if i + step < len(keys) else len(keys)
        keysSubrange = keys[lower:upper]
        records = spark.sparkContext \
          .parallelize(keysSubrange) \
          .flatMap(dbtables.VehPos.build_db_tuples_from_pb) \
          .map(lambda tpl: ((tpl[1], tpl[3]), tpl)) \
          .reduceByKey(lambda x, y: x)

        records.foreachPartition(push_vehpos_db)
        log.info("Inserted records for keys  %d-%d", lower, upper - 1)

        spark.sparkContext \
          .parallelize(keysSubrange) \
          .foreachPartition(set_vehpospb_invehpos)
        log.info("Updated IsInVehPos for keys %d-%d", lower, upper - 1)
def run(spark):
  """Updates Parquet files in S3 and the PqDate table

  Args:
    spark: Spark Session object
  """

  log = utils.get_logger()

  with DBConnCommonQueries() as conn:
    dbtables.create_if_not_exists(conn, dbtables.PqDates)

  targetDates = fetch_parquet_dts()
  for targetDate in targetDates:
    keys = fetch_keys_for_date(targetDate)
    log.info("Got %d keys of %s", len(keys), str(targetDate))

    if len(keys) > 0:
      rddKeys = spark.sparkContext \
        .parallelize(keys) \
        .map(lambda x: (x, x)) \
        .partitionBy(Settings.NumPartitions) \
        .map(lambda x: x[0])

      rddVP = rddKeys \
        .flatMap(dbtables.VehPos.build_df_tuples_from_pb) \
        .map(lambda tpl: ((tpl[1], tpl[3]), tpl)) \
        .reduceByKey(lambda x, y: x).map(lambda x: x[1])

      schema = StructType([
        StructField("RouteId", StringType(), True),
        StructField("DT", TimestampType(), False),
        StructField("VehicleId", StringType(), False),
        StructField("TripId", StringType(), False),
        StructField("Lat", DoubleType(), False),
        StructField("Lon", DoubleType(), False),
        StructField("Status", IntegerType(), True),
        StructField("StopSeq", IntegerType(), True),
        StructField("StopId", StringType(), True),
      ])
      dfVP = spark.createDataFrame(rddVP, schema)
      log.info("Created dataframe for %d keys of %s", len(keys), str(targetDate))

      pqKey = targetDate.strftime("%Y%m%d")
      pqKey = '/'.join(["parquet", "VP-" + pqKey])
      pqKey = "s3a://alxga-insde/%s" % pqKey
      dfVP.write.format("parquet").mode("overwrite").save(pqKey)
      log.info("Written to Parquet %d keys of %s", len(keys), str(targetDate))
      numRecs = dfVP.count()
    else:
      numRecs = 0

    with DBConn() as conn:
      dbtables.PqDates.insert_values(conn, targetDate, len(keys), numRecs)
      conn.commit()
def run(spark):
  """Combines GTFS schedule feed with vehicle positions Parquet files
  and updates the VPDelays and HlyDelays tables

  Args:
    spark: Spark Session object
  """

  log = utils.get_logger()

  with DBConnCommonQueries() as conn:
    dbtables.create_if_not_exists(conn, dbtables.VPDelays)
    dbtables.create_if_not_exists(conn, dbtables.HlyDelays)

  feedDescs = GTFSFetcher.read_feed_descs()
  curFeedDesc = None
  dfStopTimes = None
  feedRequiredFiles = ["stops.txt", "stop_times.txt", "trips.txt"]

  gtfsFetcher = GTFSFetcher(spark)
  # with DBConn() as conn:
  #   entriesToProcess = dbtables.PqDates \
  #     .select_pqdates_not_in_delays(conn, 'NOT IsInHlyDelays')
  entriesToProcess = [date(2020, 8, 20)]
  for targetDate in entriesToProcess:
    if dfStopTimes is None or not curFeedDesc.includes_date(targetDate):
      curFeedDesc = None
      dfStopTimes = None
      for fd in feedDescs:
        if fd.includes_date(targetDate) and fd.includes_files(feedRequiredFiles):
          curFeedDesc = fd
          dfStopTimes = gtfsFetcher.read_stop_times(curFeedDesc)
          log.info('USING FEED "%s" for %s', curFeedDesc.version,
                   targetDate.strftime("%Y-%m-%d"))
          break
    else:
      log.info('RE-USING FEED "%s" for %s', curFeedDesc.version,
               targetDate.strftime("%Y-%m-%d"))

    if dfStopTimes:
      dfVehPos = read_vp_parquet(spark, targetDate)

      calcVPDelays = \
        VPDelaysCalculator(spark, targetDate, dfStopTimes, dfVehPos)
      dfVPDelays = calcVPDelays.create_result_df()

      with DBConn() as conn:
        dbtables.VPDelays.delete_for_parquet(conn, targetDate)
        conn.commit()
      calcVPDelays.update_db(dfVPDelays)

      calcHlyDelays = HlyDelaysCalculator(spark, dfVPDelays)
      dfHlyDelays = calcHlyDelays.create_result_df().persist()
      dfGrpRoutes = calcHlyDelays.group_routes(dfHlyDelays)
      dfGrpStops = calcHlyDelays.group_stops(dfHlyDelays)
      dfGrpAll = calcHlyDelays.group_all(dfHlyDelays)
      dfHlyDelaysBus = dfHlyDelays.filter(dfHlyDelays.RouteId.rlike("^[0-9]"))
      dfHlyDelaysTrain = dfHlyDelays.filter(~dfHlyDelays.RouteId.rlike("^[0-9]"))
      dfGrpStopsBus = calcHlyDelays.group_stops(dfHlyDelaysBus)
      dfGrpAllBus = calcHlyDelays.group_all(dfHlyDelaysBus)
      dfGrpStopsTrain = calcHlyDelays.group_stops(dfHlyDelaysTrain)
      dfGrpAllTrain = calcHlyDelays.group_all(dfHlyDelaysTrain)

      with DBConn() as conn:
        dbtables.HlyDelays.delete_for_parquet(conn, targetDate)
        conn.commit()

      calcHlyDelays.update_db(dfHlyDelays, targetDate)
      calcHlyDelays.update_db(dfGrpRoutes, targetDate)
      calcHlyDelays.update_db(dfGrpStops, targetDate)
      calcHlyDelays.update_db(dfGrpAll, targetDate)
      calcHlyDelays.update_db(dfGrpStopsBus, targetDate, "ALLBUSES")
      calcHlyDelays.update_db(dfGrpAllBus, targetDate, "ALLBUSES")
      calcHlyDelays.update_db(dfGrpStopsTrain, targetDate, "ALLTRAINS")
      calcHlyDelays.update_db(dfGrpAllTrain, targetDate, "ALLTRAINS")
def run(spark):
    """Combines GTFS schedule feed with vehicle positions Parquet files
  and updates the VPDelays and HlyDelays tables

  Args:
    spark: Spark Session object
  """

    log = utils.get_logger()

    with DBConnCommonQueries() as conn:
        dbtables.create_if_not_exists(conn, dbtables.HlyDelays)

    feedDescs = GTFSFetcher.read_feed_descs()
    curFeedDesc = None
    dfStopTimes = None
    feedRequiredFiles = ["stops.txt", "stop_times.txt", "trips.txt"]

    gtfsFetcher = GTFSFetcher(spark)
    with DBConn() as conn:
        entriesToProcess = dbtables.PqDates \
          .select_pqdates_not_in_delays(conn, 'NOT IsInHlyDelaysS3')
    for targetDate in entriesToProcess:
        if dfStopTimes is None or not curFeedDesc.includes_date(targetDate):
            curFeedDesc = None
            dfStopTimes = None
            for fd in feedDescs:
                if fd.includes_date(targetDate) and fd.includes_files(
                        feedRequiredFiles):
                    curFeedDesc = fd
                    dfStopTimes = gtfsFetcher.read_stop_times(curFeedDesc)
                    log.info('USING FEED "%s" for %s', curFeedDesc.version,
                             targetDate.strftime("%Y-%m-%d"))
                    break
        else:
            log.info('RE-USING FEED "%s" for %s', curFeedDesc.version,
                     targetDate.strftime("%Y-%m-%d"))

        if dfStopTimes:
            dfVehPos = read_vp_parquet(spark, targetDate)

            calcVPDelays = \
              VPDelaysCalculator(spark, targetDate, dfStopTimes, dfVehPos)
            dfVPDelays = calcVPDelays.create_result_df()

            cols_order = [
                'RouteId', 'StopName', 'DateEST', 'HourEST', 'AvgDelay',
                'AvgDist', 'Cnt'
            ]
            calcHlyDelays = HlyDelaysCalculator(spark, dfVPDelays)
            dfHlyDelays = calcHlyDelays.create_result_df().persist()
            dfGrpRoutes = calcHlyDelays.group_routes(dfHlyDelays) \
              .withColumn('StopName', F.lit('ALLSTOPS'))
            dfGrpStops = calcHlyDelays.group_stops(dfHlyDelays) \
              .withColumn('RouteId', F.lit('ALLROUTES'))
            dfGrpAll = calcHlyDelays.group_all(dfHlyDelays) \
              .withColumn('RouteId', F.lit('ALLROUTES')) \
              .withColumn('StopName', F.lit('ALLSTOPS'))
            dfHlyDelaysBus = dfHlyDelays.filter(
                dfHlyDelays.RouteId.rlike("^[0-9]"))
            dfHlyDelaysTrain = dfHlyDelays.filter(
                ~dfHlyDelays.RouteId.rlike("^[0-9]"))
            dfGrpStopsBus = calcHlyDelays.group_stops(dfHlyDelaysBus) \
              .withColumn('RouteId', F.lit('ALLBUSES'))
            dfGrpAllBus = calcHlyDelays.group_all(dfHlyDelaysBus) \
              .withColumn('RouteId', F.lit('ALLBUSES')) \
              .withColumn('StopName', F.lit('ALLSTOPS'))
            dfGrpStopsTrain = calcHlyDelays.group_stops(dfHlyDelaysTrain) \
              .withColumn('RouteId', F.lit('ALLTRAINS'))
            dfGrpAllTrain = calcHlyDelays.group_all(dfHlyDelaysTrain) \
              .withColumn('RouteId', F.lit('ALLTRAINS')) \
              .withColumn('StopName', F.lit('ALLSTOPS'))

            dfAllHly = dfHlyDelays[cols_order] \
              .union(dfGrpRoutes[cols_order]) \
              .union(dfGrpStops[cols_order]) \
              .union(dfGrpAll[cols_order]) \
              .union(dfGrpStopsBus[cols_order]) \
              .union(dfGrpAllBus[cols_order]) \
              .union(dfGrpStopsTrain[cols_order]) \
              .union(dfGrpAllTrain[cols_order])

            with DBConnCommonQueries() as conn:
                dbtables.create_if_not_exists(conn, dbtables.RouteStops)
                data = dfAllHly[['RouteId', 'StopName']] \
                  .distinct() \
                  .collect()
                dbtables.RouteStops.insert_values(conn, data)
                conn.commit()

            calcHlyDelays.update_s3(dfAllHly, targetDate)

            with DBConn() as conn:
                dbtables.PqDates.update_in_delays(conn, targetDate,
                                                  "IsInHlyDelaysS3")
                conn.commit()