Example #1
    # Parse optional CLI args: a "dev" flag and/or an end-time stamp in "%Y-%m-%d-%H" format.
    current_str_time = None
    on_dev = False
    if len(sys.argv) == 2:
        if sys.argv[1] == "dev":
            on_dev = True
        else:
            current_str_time = sys.argv[1]
    elif len(sys.argv) > 2:
        current_str_time = sys.argv[1]
        on_dev = True

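    # Create a Hive-enabled Spark session with Kryo serialization.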
    spark = ss_util.get_spark_session(app_name="xxx_user_activity_sum_generator_Linda",
                                      configs={"spark.serializer": "org.apache.spark.serializer.KryoSerializer",
                                               "spark.sql.hive.convertMetastoreParquet": "false"},
                                      enable_hive=True)

    logger = ss_util.get_logger(spark, "xxx_user_activity_sum")

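    # Load ETL settings (Hive/HBase targets and the scheduling window) from the local config file.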
    config_file = "./recommender.json"
    config = json_util.load_json(config_file)
    config_etl = config.get("etl")
    hive_db = config_etl.get("hive_db")
    hive_xxx_user_activity_data = config_etl.get("hive_xxx_user_activity_data")
    hbase_xxx_user_activity_sum = config_etl.get("hbase_xxx_user_activity_sum")
    hbase_xxx_user_updated = config_etl.get("hbase_xxx_user_updated")
    hb_port = config_etl.get("hbase_port")
    hb_host = config_etl.get("hbase_host")
    schedule_time_delta = int(config_etl.get("schedule_time_delta"))

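    # Derive the processing window: end at the supplied timestamp (or now) and look back schedule_time_delta hours.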
    end_time = datetime.now() if not current_str_time else datetime.strptime(current_str_time, '%Y-%m-%d-%H')
    start_time = end_time - timedelta(hours=schedule_time_delta)
    start_str_time = start_time.strftime("%Y-%m-%d-%H")
Example #2
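        # Upsert the dataframe into the Hudi table registered in Hive, stored at s3_path.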
        hudi_util = HudiUtil(spark=spark, hive_db="default", hive_tb=hive_tb, path=s3_path,
                             partition_field=partition_field, record_key=record_key, hudi_options=write_hudi_options)
        hudi_util.update_hudi(df=df)

if __name__ == "__main__":

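    # Create a Hive-enabled Spark session; the dynamic partition settings allow partition-wise overwrites during the backfill.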
    spark = ss_util.get_spark_session(app_name="initial_xxxx_user_activity_sum_generator_Linda",
                                      configs={"spark.serializer": "org.apache.spark.serializer.KryoSerializer",
                                               "spark.sql.hive.convertMetastoreParquet": "false",
                                               "spark.hadoop.hive.exec.dynamic.partition": "true",
                                               "spark.hadoop.hive.exec.dynamic.partition.mode": "nonstrict",
                                               "spark.sql.sources.partitionOverwriteMode": "dynamic"
                                               },
                                      enable_hive=True)

    logger = ss_util.get_logger(spark, "event_time_preference_data")

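    # Load ETL settings (source MySQL db, Hive tables, and table schemas) from the local config file.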
    config_file = "./recommender.json"
    config = json_util.load_json(config_file)
    config_etl = config.get("etl")
    hive_db = config_etl.get("hive_db")
    src_mysql_db = config_etl.get("src_mysql_db")
    hive_xxxx_user_activity_data = config_etl.get("hive_xxxx_user_activity_data")
    hive_xxxx_event_data = config_etl.get("hive_xxxx_event_data")
    hive_xxxx_zm_event_schedule = config_etl.get("hive_xxxx_zm_event_schedule")
    hive_tables_schema = config_etl.get("hive_tables_schema")

    hive_xxxx_event_time_preference = config_etl.get("hive_event_time_preference")
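    # Build the data processor on a HiveContext bound to the existing SparkContext.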
    sc = spark.sparkContext
    sqlContext = HiveContext(sc)
    data_processor = EventTimeData(sqlContext,