import sys
from datetime import datetime, timedelta

# ss_util and json_util are project-local helpers assumed to be imported earlier in this module.

# Optional CLI arguments: a run hour formatted as "%Y-%m-%d-%H" and/or the literal "dev".
on_dev = False
current_str_time = None
if len(sys.argv) == 2:
    if sys.argv[1] == "dev":
        on_dev = True
    else:
        current_str_time = sys.argv[1]
elif len(sys.argv) > 2:
    # Both a run hour and the dev flag were supplied.
    current_str_time = sys.argv[1]
    on_dev = True

spark = ss_util.get_spark_session(
    app_name="xxx_user_activity_sum_generator_Linda",
    configs={
        "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
        "spark.sql.hive.convertMetastoreParquet": "false",
    },
    enable_hive=True,
)
logger = ss_util.get_logger(spark, "xxx_user_activity_sum")

# Read the ETL settings from the job configuration file.
config_file = "./recommender.json"
config = json_util.load_json(config_file)
config_etl = config.get("etl")
hive_db = config_etl.get("hive_db")
hive_xxx_user_activity_data = config_etl.get("hive_xxx_user_activity_data")
hbase_xxx_user_activity_sum = config_etl.get("hbase_xxx_user_activity_sum")
hbase_xxx_user_updated = config_etl.get("hbase_xxx_user_updated")
hb_port = config_etl.get("hbase_port")
hb_host = config_etl.get("hbase_host")
schedule_time_delta = int(config_etl.get("schedule_time_delta"))

# Processing window: the schedule_time_delta hours leading up to end_time.
end_time = datetime.now() if not current_str_time else datetime.strptime(current_str_time, '%Y-%m-%d-%H')
start_time = end_time - timedelta(hours=schedule_time_delta)
start_str_time = start_time.strftime("%Y-%m-%d-%H")
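
# Illustrative only (assumption, not from the repo): recommender.json itself is
# never shown in this excerpt. Based on the keys read above, its "etl" block is
# assumed to look roughly like the dict below; all values are placeholders.
_example_recommender_config = {
    "etl": {
        "hive_db": "recommender",
        "hive_xxx_user_activity_data": "xxx_user_activity_data",
        "hbase_xxx_user_activity_sum": "xxx_user_activity_sum",
        "hbase_xxx_user_updated": "xxx_user_updated",
        "hbase_host": "hbase-thrift.internal",
        "hbase_port": "9090",
        "schedule_time_delta": "3",
    }
}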
# Write the aggregated dataframe to the Hudi table via the project's HudiUtil wrapper.
hudi_util = HudiUtil(
    spark=spark,
    hive_db="default",
    hive_tb=hive_tb,
    path=s3_path,
    partition_field=partition_field,
    record_key=record_key,
    hudi_options=write_hudi_options,
)
hudi_util.update_hudi(df=df)


if __name__ == "__main__":
    spark = ss_util.get_spark_session(
        app_name="initial_xxxx_user_activity_sum_generator_Linda",
        configs={
            "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
            "spark.sql.hive.convertMetastoreParquet": "false",
            "spark.hadoop.hive.exec.dynamic.partition": "true",
            "spark.hadoop.hive.exec.dynamic.partition.mode": "nonstrict",
            "spark.sql.sources.partitionOverwriteMode": "dynamic",
        },
        enable_hive=True,
    )
    logger = ss_util.get_logger(spark, "event_time_preference_data")

    # Read the ETL settings from the job configuration file.
    config_file = "./recommender.json"
    config = json_util.load_json(config_file)
    config_etl = config.get("etl")
    hive_db = config_etl.get("hive_db")
    src_mysql_db = config_etl.get("src_mysql_db")
    hive_xxxx_user_activity_data = config_etl.get("hive_xxxx_user_activity_data")
    hive_xxxx_event_data = config_etl.get("hive_xxxx_event_data")
    hive_xxxx_zm_event_schedule = config_etl.get("hive_xxxx_zm_event_schedule")
    hive_tables_schema = config_etl.get("hive_tables_schema")
    hive_xxxx_event_time_preference = config_etl.get("hive_event_time_preference")

    sc = spark.sparkContext
    sqlContext = HiveContext(sc)
    data_processor = EventTimeData(sqlContext,
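
# Illustrative sketch (assumption): HudiUtil is a project-local wrapper whose
# internals are not shown here. An equivalent upsert with the standard Apache
# Hudi Spark datasource would look roughly like the function below; the helper
# name upsert_to_hudi and the precombine_field parameter are hypothetical.
def upsert_to_hudi(df, table_name, record_key, partition_field, precombine_field, path):
    hudi_options = {
        "hoodie.table.name": table_name,
        "hoodie.datasource.write.recordkey.field": record_key,
        "hoodie.datasource.write.partitionpath.field": partition_field,
        "hoodie.datasource.write.precombine.field": precombine_field,
        "hoodie.datasource.write.operation": "upsert",
    }
    # mode("append") performs an upsert against an existing Hudi table at `path`.
    df.write.format("hudi").options(**hudi_options).mode("append").save(path)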