Code Example #1
File: hbase_v2.py  Project: primitivenen/data
    """
    current_time: "2021-02-24-01"
    """
    on_dev = False
    current_str_time = None
    if len(sys.argv) == 2:
        if sys.argv[1] == "dev":
            on_dev = True
        else:
            current_str_time = sys.argv[1]
    else:
        # No single argument given: default to dev mode, guarding the
        # sys.argv access so a bare invocation does not raise IndexError.
        if len(sys.argv) > 1:
            current_str_time = sys.argv[1]
        on_dev = True

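    # Spark session with Kryo serialization; convertMetastoreParquet is disabled
    # so Spark reads through the Hive SerDe (commonly needed for Hudi-backed tables).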
    spark = ss_util.get_spark_session(app_name="xxx_user_activity_sum_generator_Linda",
                                      configs={"spark.serializer": "org.apache.spark.serializer.KryoSerializer",
                                               "spark.sql.hive.convertMetastoreParquet": "false"},
                                      enable_hive=True)

    logger = ss_util.get_logger(spark, "xxx_user_activity_sum")

    config_file = "./recommender.json"
    config = json_util.load_json(config_file)
    config_etl = config.get("etl")
    hive_db = config_etl.get("hive_db")
    hive_xxx_user_activity_data = config_etl.get("hive_xxx_user_activity_data")
    hbase_xxx_user_activity_sum = config_etl.get("hbase_xxx_user_activity_sum")
    hbase_xxx_user_updated = config_etl.get("hbase_xxx_user_updated")
    hb_port = config_etl.get("hbase_port")
    hb_host = config_etl.get("hbase_host")
    schedule_time_delta = int(config_etl.get("schedule_time_delta"))
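Both examples load their ETL settings from a local recommender.json via json_util.load_json, whose source is not shown here. A minimal sketch of such a helper and of the etl section's shape, assuming only the key names read above (the helper body and all sample values are illustrative, not taken from the project):

import json

def load_json(path: str) -> dict:
    # Hypothetical stand-in for json_util.load_json: parse a JSON file into a dict.
    with open(path) as f:
        return json.load(f)

# Illustrative "etl" section; key names mirror the config_etl.get(...) calls
# above, values are placeholders.
sample_config = {
    "etl": {
        "hive_db": "some_hive_db",               # assumed value
        "hbase_host": "hbase.example.internal",  # assumed value
        "hbase_port": "9090",                    # assumed value
        "schedule_time_delta": "24",             # stored as a string, cast with int() above
    }
}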
Code Example #2
        record_key = tmp_tb["record_key"]
        hive_db = tmp_tb["hive_db"]
        hive_tb = tmp_tb["hive_tb"]
        # The precombine field decides which row wins when two records share
        # the same key; this table reuses the record key itself for that.
        write_hudi_options: Dict[str, str] = {
            'hoodie.datasource.write.precombine.field': record_key
        }
        hudi_util = HudiUtil(spark=spark, hive_db="default", hive_tb=hive_tb, path=s3_path,
                             partition_field=partition_field, record_key=record_key,
                             hudi_options=write_hudi_options)
        # Upsert the incoming DataFrame into the Hudi table.
        hudi_util.update_hudi(df=df)

if __name__ == "__main__":

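    # Dynamic-partition settings: with partitionOverwriteMode=dynamic, an
    # INSERT OVERWRITE replaces only the partitions present in the written data.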
    spark = ss_util.get_spark_session(app_name="initial_xxxx_user_activity_sum_generator_Linda",
                                      configs={"spark.serializer": "org.apache.spark.serializer.KryoSerializer",
                                               "spark.sql.hive.convertMetastoreParquet": "false",
                                               "spark.hadoop.hive.exec.dynamic.partition": "true",
                                               "spark.hadoop.hive.exec.dynamic.partition.mode": "nonstrict",
                                               "spark.sql.sources.partitionOverwriteMode": "dynamic"
                                               },
                                      enable_hive=True)

    logger = ss_util.get_logger(spark, "event_time_preference_data")

    config_file = "./recommender.json"
    config = json_util.load_json(config_file)
    config_etl = config.get("etl")
    hive_db = config_etl.get("hive_db")
    src_mysql_db = config_etl.get("src_mysql_db")
    hive_xxxx_user_activity_data = config_etl.get("hive_xxxx_user_activity_data")
    hive_xxxx_event_data = config_etl.get("hive_xxxx_event_data")
    hive_xxxx_zm_event_schedule = config_etl.get("hive_xxxx_zm_event_schedule")
    hive_tables_schema = config_etl.get("hive_tables_schema")
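HudiUtil is a project-internal wrapper whose implementation is not shown. For orientation, here is a minimal sketch of the plain hudi-spark upsert that a call like hudi_util.update_hudi(df=df) typically boils down to; the option names are standard Hudi write options, but the mapping onto HudiUtil's internals is an assumption:

from typing import Dict
from pyspark.sql import DataFrame

def upsert_hudi(df: DataFrame, s3_path: str, hive_tb: str,
                record_key: str, partition_field: str) -> None:
    # Standard Hudi upsert through the Spark DataFrame writer.
    hudi_options: Dict[str, str] = {
        "hoodie.table.name": hive_tb,
        "hoodie.datasource.write.recordkey.field": record_key,
        "hoodie.datasource.write.partitionpath.field": partition_field,
        # Rows sharing a record key are resolved by the precombine field,
        # mirroring the write_hudi_options in the snippet above.
        "hoodie.datasource.write.precombine.field": record_key,
        "hoodie.datasource.write.operation": "upsert",
    }
    df.write.format("hudi").options(**hudi_options).mode("append").save(s3_path)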