""" current_time: "2021-02-24-01" """ on_dev = False current_str_time = None if len(sys.argv) == 2: if sys.argv[1] == "dev": on_dev = True else: current_str_time = sys.argv[1] else: current_str_time = sys.argv[1] on_dev = True spark = ss_util.get_spark_session(app_name="xxx_user_activity_sum_generator_Linda", configs={"spark.serializer": "org.apache.spark.serializer.KryoSerializer", "spark.sql.hive.convertMetastoreParquet": "false"}, enable_hive=True) logger = ss_util.get_logger(spark, "xxx_user_activity_sum") config_file = "./recommender.json" config = json_util.load_json(config_file) config_etl = config.get("etl") hive_db = config_etl.get("hive_db") hive_xxx_user_activity_data = config_etl.get("hive_xxx_user_activity_data") hbase_xxx_user_activity_sum = config_etl.get("hbase_xxx_user_activity_sum") hbase_xxx_user_updated = config_etl.get("hbase_xxx_user_updated") hb_port = config_etl.get("hbase_port") hb_host = config_etl.get("hbase_host") schedule_time_delta = int(config_etl.get("schedule_time_delta"))
from typing import Dict

import json_util
import ss_util
from hudi_util import HudiUtil  # project-local Hudi wrapper; import path assumed


# NOTE: the source fragment starts mid-function; the name and signature below
# are reconstructed from the variables the body uses (tmp_tb, s3_path,
# partition_field, df) so the file parses as valid Python.
def write_to_hudi(spark, tmp_tb: Dict[str, str], s3_path: str,
                  partition_field: str, df) -> None:
    record_key = tmp_tb["record_key"]
    hive_db = tmp_tb["hive_db"]  # read but unused: the call below hardcodes "default"
    hive_tb = tmp_tb["hive_tb"]
    write_hudi_options: Dict[str, str] = {
        "hoodie.datasource.write.precombine.field": record_key,
    }
    hudi_util = HudiUtil(
        spark=spark,
        hive_db="default",
        hive_tb=hive_tb,
        path=s3_path,
        partition_field=partition_field,
        record_key=record_key,
        hudi_options=write_hudi_options,
    )
    hudi_util.update_hudi(df=df)


if __name__ == "__main__":
    spark = ss_util.get_spark_session(
        app_name="initial_xxxx_user_activity_sum_generator_Linda",
        configs={
            "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
            "spark.sql.hive.convertMetastoreParquet": "false",
            "spark.hadoop.hive.exec.dynamic.partition": "true",
            "spark.hadoop.hive.exec.dynamic.partition.mode": "nonstrict",
            "spark.sql.sources.partitionOverwriteMode": "dynamic",
        },
        enable_hive=True,
    )
    logger = ss_util.get_logger(spark, "event_time_preference_data")

    config_file = "./recommender.json"
    config = json_util.load_json(config_file)
    config_etl = config.get("etl")

    hive_db = config_etl.get("hive_db")
    src_mysql_db = config_etl.get("src_mysql_db")
    hive_xxxx_user_activity_data = config_etl.get("hive_xxxx_user_activity_data")
    hive_xxxx_event_data = config_etl.get("hive_xxxx_event_data")
    hive_xxxx_zm_event_schedule = config_etl.get("hive_xxxx_zm_event_schedule")
    hive_tables_schema = config_etl.get("hive_tables_schema")
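# ---------------------------------------------------------------------------
# Illustration only: a minimal sketch of what HudiUtil.update_hudi might wrap,
# assuming it uses the standard Apache Hudi Spark datasource upsert path with
# Hive sync enabled. This is NOT the project's actual implementation; only the
# constructor/method shape is taken from the call site above, and the option
# keys are the stock Hudi ones.
# ---------------------------------------------------------------------------
from typing import Dict

from pyspark.sql import DataFrame, SparkSession


class HudiUtilSketch:
    def __init__(self, spark: SparkSession, hive_db: str, hive_tb: str,
                 path: str, partition_field: str, record_key: str,
                 hudi_options: Dict[str, str]) -> None:
        self.spark = spark
        self.path = path
        # Base upsert options; caller-supplied hudi_options (e.g. the
        # precombine field set above) extend or override these defaults.
        self.options: Dict[str, str] = {
            "hoodie.table.name": hive_tb,
            "hoodie.datasource.write.operation": "upsert",
            "hoodie.datasource.write.recordkey.field": record_key,
            "hoodie.datasource.write.partitionpath.field": partition_field,
            "hoodie.datasource.hive_sync.enable": "true",
            "hoodie.datasource.hive_sync.database": hive_db,
            "hoodie.datasource.hive_sync.table": hive_tb,
            "hoodie.datasource.hive_sync.partition_fields": partition_field,
            **hudi_options,
        }

    def update_hudi(self, df: DataFrame) -> None:
        # Append mode plus the "upsert" operation makes Hudi merge incoming
        # rows into the table on the record key rather than blindly appending.
        df.write.format("hudi").options(**self.options).mode("append").save(self.path)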